Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.24-rc6 2092 lines 49 kB view raw
1/* 2 * net/sched/sch_cbq.c Class-Based Queueing discipline. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 10 * 11 */ 12 13#include <linux/module.h> 14#include <linux/types.h> 15#include <linux/kernel.h> 16#include <linux/string.h> 17#include <linux/errno.h> 18#include <linux/skbuff.h> 19#include <net/netlink.h> 20#include <net/pkt_sched.h> 21 22 23/* Class-Based Queueing (CBQ) algorithm. 24 ======================================= 25 26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource 27 Management Models for Packet Networks", 28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 29 30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 31 32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting 33 Parameters", 1996 34 35 [4] Sally Floyd and Michael Speer, "Experimental Results 36 for Class-Based Queueing", 1998, not published. 37 38 ----------------------------------------------------------------------- 39 40 Algorithm skeleton was taken from NS simulator cbq.cc. 41 If someone wants to check this code against the LBL version, 42 he should take into account that ONLY the skeleton was borrowed, 43 the implementation is different. Particularly: 44 45 --- The WRR algorithm is different. Our version looks more 46 reasonable (I hope) and works when quanta are allowed to be 47 less than MTU, which is always the case when real time classes 48 have small rates. Note, that the statement of [3] is 49 incomplete, delay may actually be estimated even if class 50 per-round allotment is less than MTU. Namely, if per-round 51 allotment is W*r_i, and r_1+...+r_k = r < 1 52 53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B 54 55 In the worst case we have IntServ estimate with D = W*r+k*MTU 56 and C = MTU*r. The proof (if correct at all) is trivial. 57 58 59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot 60 interpret some places, which look like wrong translations 61 from NS. Anyone is advised to find these differences 62 and explain to me, why I am wrong 8). 63 64 --- Linux has no EOI event, so that we cannot estimate true class 65 idle time. Workaround is to consider the next dequeue event 66 as sign that previous packet is finished. This is wrong because of 67 internal device queueing, but on a permanently loaded link it is true. 68 Moreover, combined with clock integrator, this scheme looks 69 very close to an ideal solution. */ 70 71struct cbq_sched_data; 72 73 74struct cbq_class 75{ 76 struct cbq_class *next; /* hash table link */ 77 struct cbq_class *next_alive; /* next class with backlog in this priority band */ 78 79/* Parameters */ 80 u32 classid; 81 unsigned char priority; /* class priority */ 82 unsigned char priority2; /* priority to be used after overlimit */ 83 unsigned char ewma_log; /* time constant for idle time calculation */ 84 unsigned char ovl_strategy; 85#ifdef CONFIG_NET_CLS_ACT 86 unsigned char police; 87#endif 88 89 u32 defmap; 90 91 /* Link-sharing scheduler parameters */ 92 long maxidle; /* Class parameters: see below. */ 93 long offtime; 94 long minidle; 95 u32 avpkt; 96 struct qdisc_rate_table *R_tab; 97 98 /* Overlimit strategy parameters */ 99 void (*overlimit)(struct cbq_class *cl); 100 psched_tdiff_t penalty; 101 102 /* General scheduler (WRR) parameters */ 103 long allot; 104 long quantum; /* Allotment per WRR round */ 105 long weight; /* Relative allotment: see below */ 106 107 struct Qdisc *qdisc; /* Ptr to CBQ discipline */ 108 struct cbq_class *split; /* Ptr to split node */ 109 struct cbq_class *share; /* Ptr to LS parent in the class tree */ 110 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ 111 struct cbq_class *borrow; /* NULL if class is bandwidth limited; 112 parent otherwise */ 113 struct cbq_class *sibling; /* Sibling chain */ 114 struct cbq_class *children; /* Pointer to children chain */ 115 116 struct Qdisc *q; /* Elementary queueing discipline */ 117 118 119/* Variables */ 120 unsigned char cpriority; /* Effective priority */ 121 unsigned char delayed; 122 unsigned char level; /* level of the class in hierarchy: 123 0 for leaf classes, and maximal 124 level of children + 1 for nodes. 125 */ 126 127 psched_time_t last; /* Last end of service */ 128 psched_time_t undertime; 129 long avgidle; 130 long deficit; /* Saved deficit for WRR */ 131 psched_time_t penalized; 132 struct gnet_stats_basic bstats; 133 struct gnet_stats_queue qstats; 134 struct gnet_stats_rate_est rate_est; 135 struct tc_cbq_xstats xstats; 136 137 struct tcf_proto *filter_list; 138 139 int refcnt; 140 int filters; 141 142 struct cbq_class *defaults[TC_PRIO_MAX+1]; 143}; 144 145struct cbq_sched_data 146{ 147 struct cbq_class *classes[16]; /* Hash table of all classes */ 148 int nclasses[TC_CBQ_MAXPRIO+1]; 149 unsigned quanta[TC_CBQ_MAXPRIO+1]; 150 151 struct cbq_class link; 152 153 unsigned activemask; 154 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes 155 with backlog */ 156 157#ifdef CONFIG_NET_CLS_ACT 158 struct cbq_class *rx_class; 159#endif 160 struct cbq_class *tx_class; 161 struct cbq_class *tx_borrowed; 162 int tx_len; 163 psched_time_t now; /* Cached timestamp */ 164 psched_time_t now_rt; /* Cached real time */ 165 unsigned pmask; 166 167 struct hrtimer delay_timer; 168 struct qdisc_watchdog watchdog; /* Watchdog timer, 169 started when CBQ has 170 backlog, but cannot 171 transmit just now */ 172 psched_tdiff_t wd_expires; 173 int toplevel; 174 u32 hgenerator; 175}; 176 177 178#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len) 179 180 181static __inline__ unsigned cbq_hash(u32 h) 182{ 183 h ^= h>>8; 184 h ^= h>>4; 185 return h&0xF; 186} 187 188static __inline__ struct cbq_class * 189cbq_class_lookup(struct cbq_sched_data *q, u32 classid) 190{ 191 struct cbq_class *cl; 192 193 for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) 194 if (cl->classid == classid) 195 return cl; 196 return NULL; 197} 198 199#ifdef CONFIG_NET_CLS_ACT 200 201static struct cbq_class * 202cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) 203{ 204 struct cbq_class *cl, *new; 205 206 for (cl = this->tparent; cl; cl = cl->tparent) 207 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) 208 return new; 209 210 return NULL; 211} 212 213#endif 214 215/* Classify packet. The procedure is pretty complicated, but 216 it allows us to combine link sharing and priority scheduling 217 transparently. 218 219 Namely, you can put link sharing rules (f.e. route based) at root of CBQ, 220 so that it resolves to split nodes. Then packets are classified 221 by logical priority, or a more specific classifier may be attached 222 to the split node. 223 */ 224 225static struct cbq_class * 226cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) 227{ 228 struct cbq_sched_data *q = qdisc_priv(sch); 229 struct cbq_class *head = &q->link; 230 struct cbq_class **defmap; 231 struct cbq_class *cl = NULL; 232 u32 prio = skb->priority; 233 struct tcf_result res; 234 235 /* 236 * Step 1. If skb->priority points to one of our classes, use it. 237 */ 238 if (TC_H_MAJ(prio^sch->handle) == 0 && 239 (cl = cbq_class_lookup(q, prio)) != NULL) 240 return cl; 241 242 *qerr = NET_XMIT_BYPASS; 243 for (;;) { 244 int result = 0; 245 defmap = head->defaults; 246 247 /* 248 * Step 2+n. Apply classifier. 249 */ 250 if (!head->filter_list || 251 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) 252 goto fallback; 253 254 if ((cl = (void*)res.class) == NULL) { 255 if (TC_H_MAJ(res.classid)) 256 cl = cbq_class_lookup(q, res.classid); 257 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) 258 cl = defmap[TC_PRIO_BESTEFFORT]; 259 260 if (cl == NULL || cl->level >= head->level) 261 goto fallback; 262 } 263 264#ifdef CONFIG_NET_CLS_ACT 265 switch (result) { 266 case TC_ACT_QUEUED: 267 case TC_ACT_STOLEN: 268 *qerr = NET_XMIT_SUCCESS; 269 case TC_ACT_SHOT: 270 return NULL; 271 case TC_ACT_RECLASSIFY: 272 return cbq_reclassify(skb, cl); 273 } 274#endif 275 if (cl->level == 0) 276 return cl; 277 278 /* 279 * Step 3+n. If classifier selected a link sharing class, 280 * apply agency specific classifier. 281 * Repeat this procdure until we hit a leaf node. 282 */ 283 head = cl; 284 } 285 286fallback: 287 cl = head; 288 289 /* 290 * Step 4. No success... 291 */ 292 if (TC_H_MAJ(prio) == 0 && 293 !(cl = head->defaults[prio&TC_PRIO_MAX]) && 294 !(cl = head->defaults[TC_PRIO_BESTEFFORT])) 295 return head; 296 297 return cl; 298} 299 300/* 301 A packet has just been enqueued on the empty class. 302 cbq_activate_class adds it to the tail of active class list 303 of its priority band. 304 */ 305 306static __inline__ void cbq_activate_class(struct cbq_class *cl) 307{ 308 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 309 int prio = cl->cpriority; 310 struct cbq_class *cl_tail; 311 312 cl_tail = q->active[prio]; 313 q->active[prio] = cl; 314 315 if (cl_tail != NULL) { 316 cl->next_alive = cl_tail->next_alive; 317 cl_tail->next_alive = cl; 318 } else { 319 cl->next_alive = cl; 320 q->activemask |= (1<<prio); 321 } 322} 323 324/* 325 Unlink class from active chain. 326 Note that this same procedure is done directly in cbq_dequeue* 327 during round-robin procedure. 328 */ 329 330static void cbq_deactivate_class(struct cbq_class *this) 331{ 332 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 333 int prio = this->cpriority; 334 struct cbq_class *cl; 335 struct cbq_class *cl_prev = q->active[prio]; 336 337 do { 338 cl = cl_prev->next_alive; 339 if (cl == this) { 340 cl_prev->next_alive = cl->next_alive; 341 cl->next_alive = NULL; 342 343 if (cl == q->active[prio]) { 344 q->active[prio] = cl_prev; 345 if (cl == q->active[prio]) { 346 q->active[prio] = NULL; 347 q->activemask &= ~(1<<prio); 348 return; 349 } 350 } 351 return; 352 } 353 } while ((cl_prev = cl) != q->active[prio]); 354} 355 356static void 357cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) 358{ 359 int toplevel = q->toplevel; 360 361 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { 362 psched_time_t now; 363 psched_tdiff_t incr; 364 365 now = psched_get_time(); 366 incr = now - q->now_rt; 367 now = q->now + incr; 368 369 do { 370 if (cl->undertime < now) { 371 q->toplevel = cl->level; 372 return; 373 } 374 } while ((cl=cl->borrow) != NULL && toplevel > cl->level); 375 } 376} 377 378static int 379cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 380{ 381 struct cbq_sched_data *q = qdisc_priv(sch); 382 int len = skb->len; 383 int uninitialized_var(ret); 384 struct cbq_class *cl = cbq_classify(skb, sch, &ret); 385 386#ifdef CONFIG_NET_CLS_ACT 387 q->rx_class = cl; 388#endif 389 if (cl == NULL) { 390 if (ret == NET_XMIT_BYPASS) 391 sch->qstats.drops++; 392 kfree_skb(skb); 393 return ret; 394 } 395 396#ifdef CONFIG_NET_CLS_ACT 397 cl->q->__parent = sch; 398#endif 399 if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { 400 sch->q.qlen++; 401 sch->bstats.packets++; 402 sch->bstats.bytes+=len; 403 cbq_mark_toplevel(q, cl); 404 if (!cl->next_alive) 405 cbq_activate_class(cl); 406 return ret; 407 } 408 409 sch->qstats.drops++; 410 cbq_mark_toplevel(q, cl); 411 cl->qstats.drops++; 412 return ret; 413} 414 415static int 416cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) 417{ 418 struct cbq_sched_data *q = qdisc_priv(sch); 419 struct cbq_class *cl; 420 int ret; 421 422 if ((cl = q->tx_class) == NULL) { 423 kfree_skb(skb); 424 sch->qstats.drops++; 425 return NET_XMIT_CN; 426 } 427 q->tx_class = NULL; 428 429 cbq_mark_toplevel(q, cl); 430 431#ifdef CONFIG_NET_CLS_ACT 432 q->rx_class = cl; 433 cl->q->__parent = sch; 434#endif 435 if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { 436 sch->q.qlen++; 437 sch->qstats.requeues++; 438 if (!cl->next_alive) 439 cbq_activate_class(cl); 440 return 0; 441 } 442 sch->qstats.drops++; 443 cl->qstats.drops++; 444 return ret; 445} 446 447/* Overlimit actions */ 448 449/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ 450 451static void cbq_ovl_classic(struct cbq_class *cl) 452{ 453 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 454 psched_tdiff_t delay = cl->undertime - q->now; 455 456 if (!cl->delayed) { 457 delay += cl->offtime; 458 459 /* 460 Class goes to sleep, so that it will have no 461 chance to work avgidle. Let's forgive it 8) 462 463 BTW cbq-2.0 has a crap in this 464 place, apparently they forgot to shift it by cl->ewma_log. 465 */ 466 if (cl->avgidle < 0) 467 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 468 if (cl->avgidle < cl->minidle) 469 cl->avgidle = cl->minidle; 470 if (delay <= 0) 471 delay = 1; 472 cl->undertime = q->now + delay; 473 474 cl->xstats.overactions++; 475 cl->delayed = 1; 476 } 477 if (q->wd_expires == 0 || q->wd_expires > delay) 478 q->wd_expires = delay; 479 480 /* Dirty work! We must schedule wakeups based on 481 real available rate, rather than leaf rate, 482 which may be tiny (even zero). 483 */ 484 if (q->toplevel == TC_CBQ_MAXLEVEL) { 485 struct cbq_class *b; 486 psched_tdiff_t base_delay = q->wd_expires; 487 488 for (b = cl->borrow; b; b = b->borrow) { 489 delay = b->undertime - q->now; 490 if (delay < base_delay) { 491 if (delay <= 0) 492 delay = 1; 493 base_delay = delay; 494 } 495 } 496 497 q->wd_expires = base_delay; 498 } 499} 500 501/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when 502 they go overlimit 503 */ 504 505static void cbq_ovl_rclassic(struct cbq_class *cl) 506{ 507 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 508 struct cbq_class *this = cl; 509 510 do { 511 if (cl->level > q->toplevel) { 512 cl = NULL; 513 break; 514 } 515 } while ((cl = cl->borrow) != NULL); 516 517 if (cl == NULL) 518 cl = this; 519 cbq_ovl_classic(cl); 520} 521 522/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ 523 524static void cbq_ovl_delay(struct cbq_class *cl) 525{ 526 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 527 psched_tdiff_t delay = cl->undertime - q->now; 528 529 if (!cl->delayed) { 530 psched_time_t sched = q->now; 531 ktime_t expires; 532 533 delay += cl->offtime; 534 if (cl->avgidle < 0) 535 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 536 if (cl->avgidle < cl->minidle) 537 cl->avgidle = cl->minidle; 538 cl->undertime = q->now + delay; 539 540 if (delay > 0) { 541 sched += delay + cl->penalty; 542 cl->penalized = sched; 543 cl->cpriority = TC_CBQ_MAXPRIO; 544 q->pmask |= (1<<TC_CBQ_MAXPRIO); 545 546 expires = ktime_set(0, 0); 547 expires = ktime_add_ns(expires, PSCHED_US2NS(sched)); 548 if (hrtimer_try_to_cancel(&q->delay_timer) && 549 ktime_to_ns(ktime_sub(q->delay_timer.expires, 550 expires)) > 0) 551 q->delay_timer.expires = expires; 552 hrtimer_restart(&q->delay_timer); 553 cl->delayed = 1; 554 cl->xstats.overactions++; 555 return; 556 } 557 delay = 1; 558 } 559 if (q->wd_expires == 0 || q->wd_expires > delay) 560 q->wd_expires = delay; 561} 562 563/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ 564 565static void cbq_ovl_lowprio(struct cbq_class *cl) 566{ 567 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 568 569 cl->penalized = q->now + cl->penalty; 570 571 if (cl->cpriority != cl->priority2) { 572 cl->cpriority = cl->priority2; 573 q->pmask |= (1<<cl->cpriority); 574 cl->xstats.overactions++; 575 } 576 cbq_ovl_classic(cl); 577} 578 579/* TC_CBQ_OVL_DROP: penalize class by dropping */ 580 581static void cbq_ovl_drop(struct cbq_class *cl) 582{ 583 if (cl->q->ops->drop) 584 if (cl->q->ops->drop(cl->q)) 585 cl->qdisc->q.qlen--; 586 cl->xstats.overactions++; 587 cbq_ovl_classic(cl); 588} 589 590static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, 591 psched_time_t now) 592{ 593 struct cbq_class *cl; 594 struct cbq_class *cl_prev = q->active[prio]; 595 psched_time_t sched = now; 596 597 if (cl_prev == NULL) 598 return 0; 599 600 do { 601 cl = cl_prev->next_alive; 602 if (now - cl->penalized > 0) { 603 cl_prev->next_alive = cl->next_alive; 604 cl->next_alive = NULL; 605 cl->cpriority = cl->priority; 606 cl->delayed = 0; 607 cbq_activate_class(cl); 608 609 if (cl == q->active[prio]) { 610 q->active[prio] = cl_prev; 611 if (cl == q->active[prio]) { 612 q->active[prio] = NULL; 613 return 0; 614 } 615 } 616 617 cl = cl_prev->next_alive; 618 } else if (sched - cl->penalized > 0) 619 sched = cl->penalized; 620 } while ((cl_prev = cl) != q->active[prio]); 621 622 return sched - now; 623} 624 625static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) 626{ 627 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, 628 delay_timer); 629 struct Qdisc *sch = q->watchdog.qdisc; 630 psched_time_t now; 631 psched_tdiff_t delay = 0; 632 unsigned pmask; 633 634 now = psched_get_time(); 635 636 pmask = q->pmask; 637 q->pmask = 0; 638 639 while (pmask) { 640 int prio = ffz(~pmask); 641 psched_tdiff_t tmp; 642 643 pmask &= ~(1<<prio); 644 645 tmp = cbq_undelay_prio(q, prio, now); 646 if (tmp > 0) { 647 q->pmask |= 1<<prio; 648 if (tmp < delay || delay == 0) 649 delay = tmp; 650 } 651 } 652 653 if (delay) { 654 ktime_t time; 655 656 time = ktime_set(0, 0); 657 time = ktime_add_ns(time, PSCHED_US2NS(now + delay)); 658 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); 659 } 660 661 sch->flags &= ~TCQ_F_THROTTLED; 662 netif_schedule(sch->dev); 663 return HRTIMER_NORESTART; 664} 665 666#ifdef CONFIG_NET_CLS_ACT 667static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) 668{ 669 int len = skb->len; 670 struct Qdisc *sch = child->__parent; 671 struct cbq_sched_data *q = qdisc_priv(sch); 672 struct cbq_class *cl = q->rx_class; 673 674 q->rx_class = NULL; 675 676 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { 677 678 cbq_mark_toplevel(q, cl); 679 680 q->rx_class = cl; 681 cl->q->__parent = sch; 682 683 if (cl->q->enqueue(skb, cl->q) == 0) { 684 sch->q.qlen++; 685 sch->bstats.packets++; 686 sch->bstats.bytes+=len; 687 if (!cl->next_alive) 688 cbq_activate_class(cl); 689 return 0; 690 } 691 sch->qstats.drops++; 692 return 0; 693 } 694 695 sch->qstats.drops++; 696 return -1; 697} 698#endif 699 700/* 701 It is mission critical procedure. 702 703 We "regenerate" toplevel cutoff, if transmitting class 704 has backlog and it is not regulated. It is not part of 705 original CBQ description, but looks more reasonable. 706 Probably, it is wrong. This question needs further investigation. 707*/ 708 709static __inline__ void 710cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, 711 struct cbq_class *borrowed) 712{ 713 if (cl && q->toplevel >= borrowed->level) { 714 if (cl->q->q.qlen > 1) { 715 do { 716 if (borrowed->undertime == PSCHED_PASTPERFECT) { 717 q->toplevel = borrowed->level; 718 return; 719 } 720 } while ((borrowed=borrowed->borrow) != NULL); 721 } 722#if 0 723 /* It is not necessary now. Uncommenting it 724 will save CPU cycles, but decrease fairness. 725 */ 726 q->toplevel = TC_CBQ_MAXLEVEL; 727#endif 728 } 729} 730 731static void 732cbq_update(struct cbq_sched_data *q) 733{ 734 struct cbq_class *this = q->tx_class; 735 struct cbq_class *cl = this; 736 int len = q->tx_len; 737 738 q->tx_class = NULL; 739 740 for ( ; cl; cl = cl->share) { 741 long avgidle = cl->avgidle; 742 long idle; 743 744 cl->bstats.packets++; 745 cl->bstats.bytes += len; 746 747 /* 748 (now - last) is total time between packet right edges. 749 (last_pktlen/rate) is "virtual" busy time, so that 750 751 idle = (now - last) - last_pktlen/rate 752 */ 753 754 idle = q->now - cl->last; 755 if ((unsigned long)idle > 128*1024*1024) { 756 avgidle = cl->maxidle; 757 } else { 758 idle -= L2T(cl, len); 759 760 /* true_avgidle := (1-W)*true_avgidle + W*idle, 761 where W=2^{-ewma_log}. But cl->avgidle is scaled: 762 cl->avgidle == true_avgidle/W, 763 hence: 764 */ 765 avgidle += idle - (avgidle>>cl->ewma_log); 766 } 767 768 if (avgidle <= 0) { 769 /* Overlimit or at-limit */ 770 771 if (avgidle < cl->minidle) 772 avgidle = cl->minidle; 773 774 cl->avgidle = avgidle; 775 776 /* Calculate expected time, when this class 777 will be allowed to send. 778 It will occur, when: 779 (1-W)*true_avgidle + W*delay = 0, i.e. 780 idle = (1/W - 1)*(-true_avgidle) 781 or 782 idle = (1 - W)*(-cl->avgidle); 783 */ 784 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); 785 786 /* 787 That is not all. 788 To maintain the rate allocated to the class, 789 we add to undertime virtual clock, 790 necessary to complete transmitted packet. 791 (len/phys_bandwidth has been already passed 792 to the moment of cbq_update) 793 */ 794 795 idle -= L2T(&q->link, len); 796 idle += L2T(cl, len); 797 798 cl->undertime = q->now + idle; 799 } else { 800 /* Underlimit */ 801 802 cl->undertime = PSCHED_PASTPERFECT; 803 if (avgidle > cl->maxidle) 804 cl->avgidle = cl->maxidle; 805 else 806 cl->avgidle = avgidle; 807 } 808 cl->last = q->now; 809 } 810 811 cbq_update_toplevel(q, this, q->tx_borrowed); 812} 813 814static __inline__ struct cbq_class * 815cbq_under_limit(struct cbq_class *cl) 816{ 817 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 818 struct cbq_class *this_cl = cl; 819 820 if (cl->tparent == NULL) 821 return cl; 822 823 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { 824 cl->delayed = 0; 825 return cl; 826 } 827 828 do { 829 /* It is very suspicious place. Now overlimit 830 action is generated for not bounded classes 831 only if link is completely congested. 832 Though it is in agree with ancestor-only paradigm, 833 it looks very stupid. Particularly, 834 it means that this chunk of code will either 835 never be called or result in strong amplification 836 of burstiness. Dangerous, silly, and, however, 837 no another solution exists. 838 */ 839 if ((cl = cl->borrow) == NULL) { 840 this_cl->qstats.overlimits++; 841 this_cl->overlimit(this_cl); 842 return NULL; 843 } 844 if (cl->level > q->toplevel) 845 return NULL; 846 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); 847 848 cl->delayed = 0; 849 return cl; 850} 851 852static __inline__ struct sk_buff * 853cbq_dequeue_prio(struct Qdisc *sch, int prio) 854{ 855 struct cbq_sched_data *q = qdisc_priv(sch); 856 struct cbq_class *cl_tail, *cl_prev, *cl; 857 struct sk_buff *skb; 858 int deficit; 859 860 cl_tail = cl_prev = q->active[prio]; 861 cl = cl_prev->next_alive; 862 863 do { 864 deficit = 0; 865 866 /* Start round */ 867 do { 868 struct cbq_class *borrow = cl; 869 870 if (cl->q->q.qlen && 871 (borrow = cbq_under_limit(cl)) == NULL) 872 goto skip_class; 873 874 if (cl->deficit <= 0) { 875 /* Class exhausted its allotment per 876 this round. Switch to the next one. 877 */ 878 deficit = 1; 879 cl->deficit += cl->quantum; 880 goto next_class; 881 } 882 883 skb = cl->q->dequeue(cl->q); 884 885 /* Class did not give us any skb :-( 886 It could occur even if cl->q->q.qlen != 0 887 f.e. if cl->q == "tbf" 888 */ 889 if (skb == NULL) 890 goto skip_class; 891 892 cl->deficit -= skb->len; 893 q->tx_class = cl; 894 q->tx_borrowed = borrow; 895 if (borrow != cl) { 896#ifndef CBQ_XSTATS_BORROWS_BYTES 897 borrow->xstats.borrows++; 898 cl->xstats.borrows++; 899#else 900 borrow->xstats.borrows += skb->len; 901 cl->xstats.borrows += skb->len; 902#endif 903 } 904 q->tx_len = skb->len; 905 906 if (cl->deficit <= 0) { 907 q->active[prio] = cl; 908 cl = cl->next_alive; 909 cl->deficit += cl->quantum; 910 } 911 return skb; 912 913skip_class: 914 if (cl->q->q.qlen == 0 || prio != cl->cpriority) { 915 /* Class is empty or penalized. 916 Unlink it from active chain. 917 */ 918 cl_prev->next_alive = cl->next_alive; 919 cl->next_alive = NULL; 920 921 /* Did cl_tail point to it? */ 922 if (cl == cl_tail) { 923 /* Repair it! */ 924 cl_tail = cl_prev; 925 926 /* Was it the last class in this band? */ 927 if (cl == cl_tail) { 928 /* Kill the band! */ 929 q->active[prio] = NULL; 930 q->activemask &= ~(1<<prio); 931 if (cl->q->q.qlen) 932 cbq_activate_class(cl); 933 return NULL; 934 } 935 936 q->active[prio] = cl_tail; 937 } 938 if (cl->q->q.qlen) 939 cbq_activate_class(cl); 940 941 cl = cl_prev; 942 } 943 944next_class: 945 cl_prev = cl; 946 cl = cl->next_alive; 947 } while (cl_prev != cl_tail); 948 } while (deficit); 949 950 q->active[prio] = cl_prev; 951 952 return NULL; 953} 954 955static __inline__ struct sk_buff * 956cbq_dequeue_1(struct Qdisc *sch) 957{ 958 struct cbq_sched_data *q = qdisc_priv(sch); 959 struct sk_buff *skb; 960 unsigned activemask; 961 962 activemask = q->activemask&0xFF; 963 while (activemask) { 964 int prio = ffz(~activemask); 965 activemask &= ~(1<<prio); 966 skb = cbq_dequeue_prio(sch, prio); 967 if (skb) 968 return skb; 969 } 970 return NULL; 971} 972 973static struct sk_buff * 974cbq_dequeue(struct Qdisc *sch) 975{ 976 struct sk_buff *skb; 977 struct cbq_sched_data *q = qdisc_priv(sch); 978 psched_time_t now; 979 psched_tdiff_t incr; 980 981 now = psched_get_time(); 982 incr = now - q->now_rt; 983 984 if (q->tx_class) { 985 psched_tdiff_t incr2; 986 /* Time integrator. We calculate EOS time 987 by adding expected packet transmission time. 988 If real time is greater, we warp artificial clock, 989 so that: 990 991 cbq_time = max(real_time, work); 992 */ 993 incr2 = L2T(&q->link, q->tx_len); 994 q->now += incr2; 995 cbq_update(q); 996 if ((incr -= incr2) < 0) 997 incr = 0; 998 } 999 q->now += incr; 1000 q->now_rt = now; 1001 1002 for (;;) { 1003 q->wd_expires = 0; 1004 1005 skb = cbq_dequeue_1(sch); 1006 if (skb) { 1007 sch->q.qlen--; 1008 sch->flags &= ~TCQ_F_THROTTLED; 1009 return skb; 1010 } 1011 1012 /* All the classes are overlimit. 1013 1014 It is possible, if: 1015 1016 1. Scheduler is empty. 1017 2. Toplevel cutoff inhibited borrowing. 1018 3. Root class is overlimit. 1019 1020 Reset 2d and 3d conditions and retry. 1021 1022 Note, that NS and cbq-2.0 are buggy, peeking 1023 an arbitrary class is appropriate for ancestor-only 1024 sharing, but not for toplevel algorithm. 1025 1026 Our version is better, but slower, because it requires 1027 two passes, but it is unavoidable with top-level sharing. 1028 */ 1029 1030 if (q->toplevel == TC_CBQ_MAXLEVEL && 1031 q->link.undertime == PSCHED_PASTPERFECT) 1032 break; 1033 1034 q->toplevel = TC_CBQ_MAXLEVEL; 1035 q->link.undertime = PSCHED_PASTPERFECT; 1036 } 1037 1038 /* No packets in scheduler or nobody wants to give them to us :-( 1039 Sigh... start watchdog timer in the last case. */ 1040 1041 if (sch->q.qlen) { 1042 sch->qstats.overlimits++; 1043 if (q->wd_expires) 1044 qdisc_watchdog_schedule(&q->watchdog, 1045 now + q->wd_expires); 1046 } 1047 return NULL; 1048} 1049 1050/* CBQ class maintanance routines */ 1051 1052static void cbq_adjust_levels(struct cbq_class *this) 1053{ 1054 if (this == NULL) 1055 return; 1056 1057 do { 1058 int level = 0; 1059 struct cbq_class *cl; 1060 1061 if ((cl = this->children) != NULL) { 1062 do { 1063 if (cl->level > level) 1064 level = cl->level; 1065 } while ((cl = cl->sibling) != this->children); 1066 } 1067 this->level = level+1; 1068 } while ((this = this->tparent) != NULL); 1069} 1070 1071static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) 1072{ 1073 struct cbq_class *cl; 1074 unsigned h; 1075 1076 if (q->quanta[prio] == 0) 1077 return; 1078 1079 for (h=0; h<16; h++) { 1080 for (cl = q->classes[h]; cl; cl = cl->next) { 1081 /* BUGGGG... Beware! This expression suffer of 1082 arithmetic overflows! 1083 */ 1084 if (cl->priority == prio) { 1085 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ 1086 q->quanta[prio]; 1087 } 1088 if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { 1089 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); 1090 cl->quantum = cl->qdisc->dev->mtu/2 + 1; 1091 } 1092 } 1093 } 1094} 1095 1096static void cbq_sync_defmap(struct cbq_class *cl) 1097{ 1098 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1099 struct cbq_class *split = cl->split; 1100 unsigned h; 1101 int i; 1102 1103 if (split == NULL) 1104 return; 1105 1106 for (i=0; i<=TC_PRIO_MAX; i++) { 1107 if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) 1108 split->defaults[i] = NULL; 1109 } 1110 1111 for (i=0; i<=TC_PRIO_MAX; i++) { 1112 int level = split->level; 1113 1114 if (split->defaults[i]) 1115 continue; 1116 1117 for (h=0; h<16; h++) { 1118 struct cbq_class *c; 1119 1120 for (c = q->classes[h]; c; c = c->next) { 1121 if (c->split == split && c->level < level && 1122 c->defmap&(1<<i)) { 1123 split->defaults[i] = c; 1124 level = c->level; 1125 } 1126 } 1127 } 1128 } 1129} 1130 1131static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) 1132{ 1133 struct cbq_class *split = NULL; 1134 1135 if (splitid == 0) { 1136 if ((split = cl->split) == NULL) 1137 return; 1138 splitid = split->classid; 1139 } 1140 1141 if (split == NULL || split->classid != splitid) { 1142 for (split = cl->tparent; split; split = split->tparent) 1143 if (split->classid == splitid) 1144 break; 1145 } 1146 1147 if (split == NULL) 1148 return; 1149 1150 if (cl->split != split) { 1151 cl->defmap = 0; 1152 cbq_sync_defmap(cl); 1153 cl->split = split; 1154 cl->defmap = def&mask; 1155 } else 1156 cl->defmap = (cl->defmap&~mask)|(def&mask); 1157 1158 cbq_sync_defmap(cl); 1159} 1160 1161static void cbq_unlink_class(struct cbq_class *this) 1162{ 1163 struct cbq_class *cl, **clp; 1164 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1165 1166 for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { 1167 if (cl == this) { 1168 *clp = cl->next; 1169 cl->next = NULL; 1170 break; 1171 } 1172 } 1173 1174 if (this->tparent) { 1175 clp=&this->sibling; 1176 cl = *clp; 1177 do { 1178 if (cl == this) { 1179 *clp = cl->sibling; 1180 break; 1181 } 1182 clp = &cl->sibling; 1183 } while ((cl = *clp) != this->sibling); 1184 1185 if (this->tparent->children == this) { 1186 this->tparent->children = this->sibling; 1187 if (this->sibling == this) 1188 this->tparent->children = NULL; 1189 } 1190 } else { 1191 BUG_TRAP(this->sibling == this); 1192 } 1193} 1194 1195static void cbq_link_class(struct cbq_class *this) 1196{ 1197 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1198 unsigned h = cbq_hash(this->classid); 1199 struct cbq_class *parent = this->tparent; 1200 1201 this->sibling = this; 1202 this->next = q->classes[h]; 1203 q->classes[h] = this; 1204 1205 if (parent == NULL) 1206 return; 1207 1208 if (parent->children == NULL) { 1209 parent->children = this; 1210 } else { 1211 this->sibling = parent->children->sibling; 1212 parent->children->sibling = this; 1213 } 1214} 1215 1216static unsigned int cbq_drop(struct Qdisc* sch) 1217{ 1218 struct cbq_sched_data *q = qdisc_priv(sch); 1219 struct cbq_class *cl, *cl_head; 1220 int prio; 1221 unsigned int len; 1222 1223 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { 1224 if ((cl_head = q->active[prio]) == NULL) 1225 continue; 1226 1227 cl = cl_head; 1228 do { 1229 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { 1230 sch->q.qlen--; 1231 if (!cl->q->q.qlen) 1232 cbq_deactivate_class(cl); 1233 return len; 1234 } 1235 } while ((cl = cl->next_alive) != cl_head); 1236 } 1237 return 0; 1238} 1239 1240static void 1241cbq_reset(struct Qdisc* sch) 1242{ 1243 struct cbq_sched_data *q = qdisc_priv(sch); 1244 struct cbq_class *cl; 1245 int prio; 1246 unsigned h; 1247 1248 q->activemask = 0; 1249 q->pmask = 0; 1250 q->tx_class = NULL; 1251 q->tx_borrowed = NULL; 1252 qdisc_watchdog_cancel(&q->watchdog); 1253 hrtimer_cancel(&q->delay_timer); 1254 q->toplevel = TC_CBQ_MAXLEVEL; 1255 q->now = psched_get_time(); 1256 q->now_rt = q->now; 1257 1258 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) 1259 q->active[prio] = NULL; 1260 1261 for (h = 0; h < 16; h++) { 1262 for (cl = q->classes[h]; cl; cl = cl->next) { 1263 qdisc_reset(cl->q); 1264 1265 cl->next_alive = NULL; 1266 cl->undertime = PSCHED_PASTPERFECT; 1267 cl->avgidle = cl->maxidle; 1268 cl->deficit = cl->quantum; 1269 cl->cpriority = cl->priority; 1270 } 1271 } 1272 sch->q.qlen = 0; 1273} 1274 1275 1276static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) 1277{ 1278 if (lss->change&TCF_CBQ_LSS_FLAGS) { 1279 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; 1280 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; 1281 } 1282 if (lss->change&TCF_CBQ_LSS_EWMA) 1283 cl->ewma_log = lss->ewma_log; 1284 if (lss->change&TCF_CBQ_LSS_AVPKT) 1285 cl->avpkt = lss->avpkt; 1286 if (lss->change&TCF_CBQ_LSS_MINIDLE) 1287 cl->minidle = -(long)lss->minidle; 1288 if (lss->change&TCF_CBQ_LSS_MAXIDLE) { 1289 cl->maxidle = lss->maxidle; 1290 cl->avgidle = lss->maxidle; 1291 } 1292 if (lss->change&TCF_CBQ_LSS_OFFTIME) 1293 cl->offtime = lss->offtime; 1294 return 0; 1295} 1296 1297static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) 1298{ 1299 q->nclasses[cl->priority]--; 1300 q->quanta[cl->priority] -= cl->weight; 1301 cbq_normalize_quanta(q, cl->priority); 1302} 1303 1304static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) 1305{ 1306 q->nclasses[cl->priority]++; 1307 q->quanta[cl->priority] += cl->weight; 1308 cbq_normalize_quanta(q, cl->priority); 1309} 1310 1311static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) 1312{ 1313 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1314 1315 if (wrr->allot) 1316 cl->allot = wrr->allot; 1317 if (wrr->weight) 1318 cl->weight = wrr->weight; 1319 if (wrr->priority) { 1320 cl->priority = wrr->priority-1; 1321 cl->cpriority = cl->priority; 1322 if (cl->priority >= cl->priority2) 1323 cl->priority2 = TC_CBQ_MAXPRIO-1; 1324 } 1325 1326 cbq_addprio(q, cl); 1327 return 0; 1328} 1329 1330static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) 1331{ 1332 switch (ovl->strategy) { 1333 case TC_CBQ_OVL_CLASSIC: 1334 cl->overlimit = cbq_ovl_classic; 1335 break; 1336 case TC_CBQ_OVL_DELAY: 1337 cl->overlimit = cbq_ovl_delay; 1338 break; 1339 case TC_CBQ_OVL_LOWPRIO: 1340 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || 1341 ovl->priority2-1 <= cl->priority) 1342 return -EINVAL; 1343 cl->priority2 = ovl->priority2-1; 1344 cl->overlimit = cbq_ovl_lowprio; 1345 break; 1346 case TC_CBQ_OVL_DROP: 1347 cl->overlimit = cbq_ovl_drop; 1348 break; 1349 case TC_CBQ_OVL_RCLASSIC: 1350 cl->overlimit = cbq_ovl_rclassic; 1351 break; 1352 default: 1353 return -EINVAL; 1354 } 1355 cl->penalty = ovl->penalty; 1356 return 0; 1357} 1358 1359#ifdef CONFIG_NET_CLS_ACT 1360static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) 1361{ 1362 cl->police = p->police; 1363 1364 if (cl->q->handle) { 1365 if (p->police == TC_POLICE_RECLASSIFY) 1366 cl->q->reshape_fail = cbq_reshape_fail; 1367 else 1368 cl->q->reshape_fail = NULL; 1369 } 1370 return 0; 1371} 1372#endif 1373 1374static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) 1375{ 1376 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); 1377 return 0; 1378} 1379 1380static int cbq_init(struct Qdisc *sch, struct rtattr *opt) 1381{ 1382 struct cbq_sched_data *q = qdisc_priv(sch); 1383 struct rtattr *tb[TCA_CBQ_MAX]; 1384 struct tc_ratespec *r; 1385 1386 if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || 1387 tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || 1388 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) 1389 return -EINVAL; 1390 1391 if (tb[TCA_CBQ_LSSOPT-1] && 1392 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) 1393 return -EINVAL; 1394 1395 r = RTA_DATA(tb[TCA_CBQ_RATE-1]); 1396 1397 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) 1398 return -EINVAL; 1399 1400 q->link.refcnt = 1; 1401 q->link.sibling = &q->link; 1402 q->link.classid = sch->handle; 1403 q->link.qdisc = sch; 1404 if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, 1405 sch->handle))) 1406 q->link.q = &noop_qdisc; 1407 1408 q->link.priority = TC_CBQ_MAXPRIO-1; 1409 q->link.priority2 = TC_CBQ_MAXPRIO-1; 1410 q->link.cpriority = TC_CBQ_MAXPRIO-1; 1411 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; 1412 q->link.overlimit = cbq_ovl_classic; 1413 q->link.allot = psched_mtu(sch->dev); 1414 q->link.quantum = q->link.allot; 1415 q->link.weight = q->link.R_tab->rate.rate; 1416 1417 q->link.ewma_log = TC_CBQ_DEF_EWMA; 1418 q->link.avpkt = q->link.allot/2; 1419 q->link.minidle = -0x7FFFFFFF; 1420 1421 qdisc_watchdog_init(&q->watchdog, sch); 1422 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1423 q->delay_timer.function = cbq_undelay; 1424 q->toplevel = TC_CBQ_MAXLEVEL; 1425 q->now = psched_get_time(); 1426 q->now_rt = q->now; 1427 1428 cbq_link_class(&q->link); 1429 1430 if (tb[TCA_CBQ_LSSOPT-1]) 1431 cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); 1432 1433 cbq_addprio(q, &q->link); 1434 return 0; 1435} 1436 1437static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) 1438{ 1439 unsigned char *b = skb_tail_pointer(skb); 1440 1441 RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); 1442 return skb->len; 1443 1444rtattr_failure: 1445 nlmsg_trim(skb, b); 1446 return -1; 1447} 1448 1449static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) 1450{ 1451 unsigned char *b = skb_tail_pointer(skb); 1452 struct tc_cbq_lssopt opt; 1453 1454 opt.flags = 0; 1455 if (cl->borrow == NULL) 1456 opt.flags |= TCF_CBQ_LSS_BOUNDED; 1457 if (cl->share == NULL) 1458 opt.flags |= TCF_CBQ_LSS_ISOLATED; 1459 opt.ewma_log = cl->ewma_log; 1460 opt.level = cl->level; 1461 opt.avpkt = cl->avpkt; 1462 opt.maxidle = cl->maxidle; 1463 opt.minidle = (u32)(-cl->minidle); 1464 opt.offtime = cl->offtime; 1465 opt.change = ~0; 1466 RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); 1467 return skb->len; 1468 1469rtattr_failure: 1470 nlmsg_trim(skb, b); 1471 return -1; 1472} 1473 1474static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) 1475{ 1476 unsigned char *b = skb_tail_pointer(skb); 1477 struct tc_cbq_wrropt opt; 1478 1479 opt.flags = 0; 1480 opt.allot = cl->allot; 1481 opt.priority = cl->priority+1; 1482 opt.cpriority = cl->cpriority+1; 1483 opt.weight = cl->weight; 1484 RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); 1485 return skb->len; 1486 1487rtattr_failure: 1488 nlmsg_trim(skb, b); 1489 return -1; 1490} 1491 1492static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) 1493{ 1494 unsigned char *b = skb_tail_pointer(skb); 1495 struct tc_cbq_ovl opt; 1496 1497 opt.strategy = cl->ovl_strategy; 1498 opt.priority2 = cl->priority2+1; 1499 opt.pad = 0; 1500 opt.penalty = cl->penalty; 1501 RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1502 return skb->len; 1503 1504rtattr_failure: 1505 nlmsg_trim(skb, b); 1506 return -1; 1507} 1508 1509static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) 1510{ 1511 unsigned char *b = skb_tail_pointer(skb); 1512 struct tc_cbq_fopt opt; 1513 1514 if (cl->split || cl->defmap) { 1515 opt.split = cl->split ? cl->split->classid : 0; 1516 opt.defmap = cl->defmap; 1517 opt.defchange = ~0; 1518 RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); 1519 } 1520 return skb->len; 1521 1522rtattr_failure: 1523 nlmsg_trim(skb, b); 1524 return -1; 1525} 1526 1527#ifdef CONFIG_NET_CLS_ACT 1528static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) 1529{ 1530 unsigned char *b = skb_tail_pointer(skb); 1531 struct tc_cbq_police opt; 1532 1533 if (cl->police) { 1534 opt.police = cl->police; 1535 opt.__res1 = 0; 1536 opt.__res2 = 0; 1537 RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1538 } 1539 return skb->len; 1540 1541rtattr_failure: 1542 nlmsg_trim(skb, b); 1543 return -1; 1544} 1545#endif 1546 1547static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) 1548{ 1549 if (cbq_dump_lss(skb, cl) < 0 || 1550 cbq_dump_rate(skb, cl) < 0 || 1551 cbq_dump_wrr(skb, cl) < 0 || 1552 cbq_dump_ovl(skb, cl) < 0 || 1553#ifdef CONFIG_NET_CLS_ACT 1554 cbq_dump_police(skb, cl) < 0 || 1555#endif 1556 cbq_dump_fopt(skb, cl) < 0) 1557 return -1; 1558 return 0; 1559} 1560 1561static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) 1562{ 1563 struct cbq_sched_data *q = qdisc_priv(sch); 1564 unsigned char *b = skb_tail_pointer(skb); 1565 struct rtattr *rta; 1566 1567 rta = (struct rtattr*)b; 1568 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 1569 if (cbq_dump_attr(skb, &q->link) < 0) 1570 goto rtattr_failure; 1571 rta->rta_len = skb_tail_pointer(skb) - b; 1572 return skb->len; 1573 1574rtattr_failure: 1575 nlmsg_trim(skb, b); 1576 return -1; 1577} 1578 1579static int 1580cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 1581{ 1582 struct cbq_sched_data *q = qdisc_priv(sch); 1583 1584 q->link.xstats.avgidle = q->link.avgidle; 1585 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); 1586} 1587 1588static int 1589cbq_dump_class(struct Qdisc *sch, unsigned long arg, 1590 struct sk_buff *skb, struct tcmsg *tcm) 1591{ 1592 struct cbq_class *cl = (struct cbq_class*)arg; 1593 unsigned char *b = skb_tail_pointer(skb); 1594 struct rtattr *rta; 1595 1596 if (cl->tparent) 1597 tcm->tcm_parent = cl->tparent->classid; 1598 else 1599 tcm->tcm_parent = TC_H_ROOT; 1600 tcm->tcm_handle = cl->classid; 1601 tcm->tcm_info = cl->q->handle; 1602 1603 rta = (struct rtattr*)b; 1604 RTA_PUT(skb, TCA_OPTIONS, 0, NULL); 1605 if (cbq_dump_attr(skb, cl) < 0) 1606 goto rtattr_failure; 1607 rta->rta_len = skb_tail_pointer(skb) - b; 1608 return skb->len; 1609 1610rtattr_failure: 1611 nlmsg_trim(skb, b); 1612 return -1; 1613} 1614 1615static int 1616cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, 1617 struct gnet_dump *d) 1618{ 1619 struct cbq_sched_data *q = qdisc_priv(sch); 1620 struct cbq_class *cl = (struct cbq_class*)arg; 1621 1622 cl->qstats.qlen = cl->q->q.qlen; 1623 cl->xstats.avgidle = cl->avgidle; 1624 cl->xstats.undertime = 0; 1625 1626 if (cl->undertime != PSCHED_PASTPERFECT) 1627 cl->xstats.undertime = cl->undertime - q->now; 1628 1629 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || 1630 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || 1631 gnet_stats_copy_queue(d, &cl->qstats) < 0) 1632 return -1; 1633 1634 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); 1635} 1636 1637static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1638 struct Qdisc **old) 1639{ 1640 struct cbq_class *cl = (struct cbq_class*)arg; 1641 1642 if (cl) { 1643 if (new == NULL) { 1644 if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, 1645 cl->classid)) == NULL) 1646 return -ENOBUFS; 1647 } else { 1648#ifdef CONFIG_NET_CLS_ACT 1649 if (cl->police == TC_POLICE_RECLASSIFY) 1650 new->reshape_fail = cbq_reshape_fail; 1651#endif 1652 } 1653 sch_tree_lock(sch); 1654 *old = xchg(&cl->q, new); 1655 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); 1656 qdisc_reset(*old); 1657 sch_tree_unlock(sch); 1658 1659 return 0; 1660 } 1661 return -ENOENT; 1662} 1663 1664static struct Qdisc * 1665cbq_leaf(struct Qdisc *sch, unsigned long arg) 1666{ 1667 struct cbq_class *cl = (struct cbq_class*)arg; 1668 1669 return cl ? cl->q : NULL; 1670} 1671 1672static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1673{ 1674 struct cbq_class *cl = (struct cbq_class *)arg; 1675 1676 if (cl->q->q.qlen == 0) 1677 cbq_deactivate_class(cl); 1678} 1679 1680static unsigned long cbq_get(struct Qdisc *sch, u32 classid) 1681{ 1682 struct cbq_sched_data *q = qdisc_priv(sch); 1683 struct cbq_class *cl = cbq_class_lookup(q, classid); 1684 1685 if (cl) { 1686 cl->refcnt++; 1687 return (unsigned long)cl; 1688 } 1689 return 0; 1690} 1691 1692static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) 1693{ 1694 struct cbq_sched_data *q = qdisc_priv(sch); 1695 1696 BUG_TRAP(!cl->filters); 1697 1698 tcf_destroy_chain(cl->filter_list); 1699 qdisc_destroy(cl->q); 1700 qdisc_put_rtab(cl->R_tab); 1701 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1702 if (cl != &q->link) 1703 kfree(cl); 1704} 1705 1706static void 1707cbq_destroy(struct Qdisc* sch) 1708{ 1709 struct cbq_sched_data *q = qdisc_priv(sch); 1710 struct cbq_class *cl; 1711 unsigned h; 1712 1713#ifdef CONFIG_NET_CLS_ACT 1714 q->rx_class = NULL; 1715#endif 1716 /* 1717 * Filters must be destroyed first because we don't destroy the 1718 * classes from root to leafs which means that filters can still 1719 * be bound to classes which have been destroyed already. --TGR '04 1720 */ 1721 for (h = 0; h < 16; h++) { 1722 for (cl = q->classes[h]; cl; cl = cl->next) { 1723 tcf_destroy_chain(cl->filter_list); 1724 cl->filter_list = NULL; 1725 } 1726 } 1727 for (h = 0; h < 16; h++) { 1728 struct cbq_class *next; 1729 1730 for (cl = q->classes[h]; cl; cl = next) { 1731 next = cl->next; 1732 cbq_destroy_class(sch, cl); 1733 } 1734 } 1735} 1736 1737static void cbq_put(struct Qdisc *sch, unsigned long arg) 1738{ 1739 struct cbq_class *cl = (struct cbq_class*)arg; 1740 1741 if (--cl->refcnt == 0) { 1742#ifdef CONFIG_NET_CLS_ACT 1743 struct cbq_sched_data *q = qdisc_priv(sch); 1744 1745 spin_lock_bh(&sch->dev->queue_lock); 1746 if (q->rx_class == cl) 1747 q->rx_class = NULL; 1748 spin_unlock_bh(&sch->dev->queue_lock); 1749#endif 1750 1751 cbq_destroy_class(sch, cl); 1752 } 1753} 1754 1755static int 1756cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, 1757 unsigned long *arg) 1758{ 1759 int err; 1760 struct cbq_sched_data *q = qdisc_priv(sch); 1761 struct cbq_class *cl = (struct cbq_class*)*arg; 1762 struct rtattr *opt = tca[TCA_OPTIONS-1]; 1763 struct rtattr *tb[TCA_CBQ_MAX]; 1764 struct cbq_class *parent; 1765 struct qdisc_rate_table *rtab = NULL; 1766 1767 if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) 1768 return -EINVAL; 1769 1770 if (tb[TCA_CBQ_OVL_STRATEGY-1] && 1771 RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) 1772 return -EINVAL; 1773 1774 if (tb[TCA_CBQ_FOPT-1] && 1775 RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) 1776 return -EINVAL; 1777 1778 if (tb[TCA_CBQ_RATE-1] && 1779 RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) 1780 return -EINVAL; 1781 1782 if (tb[TCA_CBQ_LSSOPT-1] && 1783 RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) 1784 return -EINVAL; 1785 1786 if (tb[TCA_CBQ_WRROPT-1] && 1787 RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) 1788 return -EINVAL; 1789 1790#ifdef CONFIG_NET_CLS_ACT 1791 if (tb[TCA_CBQ_POLICE-1] && 1792 RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) 1793 return -EINVAL; 1794#endif 1795 1796 if (cl) { 1797 /* Check parent */ 1798 if (parentid) { 1799 if (cl->tparent && cl->tparent->classid != parentid) 1800 return -EINVAL; 1801 if (!cl->tparent && parentid != TC_H_ROOT) 1802 return -EINVAL; 1803 } 1804 1805 if (tb[TCA_CBQ_RATE-1]) { 1806 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); 1807 if (rtab == NULL) 1808 return -EINVAL; 1809 } 1810 1811 /* Change class parameters */ 1812 sch_tree_lock(sch); 1813 1814 if (cl->next_alive != NULL) 1815 cbq_deactivate_class(cl); 1816 1817 if (rtab) { 1818 rtab = xchg(&cl->R_tab, rtab); 1819 qdisc_put_rtab(rtab); 1820 } 1821 1822 if (tb[TCA_CBQ_LSSOPT-1]) 1823 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); 1824 1825 if (tb[TCA_CBQ_WRROPT-1]) { 1826 cbq_rmprio(q, cl); 1827 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); 1828 } 1829 1830 if (tb[TCA_CBQ_OVL_STRATEGY-1]) 1831 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); 1832 1833#ifdef CONFIG_NET_CLS_ACT 1834 if (tb[TCA_CBQ_POLICE-1]) 1835 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); 1836#endif 1837 1838 if (tb[TCA_CBQ_FOPT-1]) 1839 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); 1840 1841 if (cl->q->q.qlen) 1842 cbq_activate_class(cl); 1843 1844 sch_tree_unlock(sch); 1845 1846 if (tca[TCA_RATE-1]) 1847 gen_replace_estimator(&cl->bstats, &cl->rate_est, 1848 &sch->dev->queue_lock, 1849 tca[TCA_RATE-1]); 1850 return 0; 1851 } 1852 1853 if (parentid == TC_H_ROOT) 1854 return -EINVAL; 1855 1856 if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || 1857 tb[TCA_CBQ_LSSOPT-1] == NULL) 1858 return -EINVAL; 1859 1860 rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); 1861 if (rtab == NULL) 1862 return -EINVAL; 1863 1864 if (classid) { 1865 err = -EINVAL; 1866 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) 1867 goto failure; 1868 } else { 1869 int i; 1870 classid = TC_H_MAKE(sch->handle,0x8000); 1871 1872 for (i=0; i<0x8000; i++) { 1873 if (++q->hgenerator >= 0x8000) 1874 q->hgenerator = 1; 1875 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) 1876 break; 1877 } 1878 err = -ENOSR; 1879 if (i >= 0x8000) 1880 goto failure; 1881 classid = classid|q->hgenerator; 1882 } 1883 1884 parent = &q->link; 1885 if (parentid) { 1886 parent = cbq_class_lookup(q, parentid); 1887 err = -EINVAL; 1888 if (parent == NULL) 1889 goto failure; 1890 } 1891 1892 err = -ENOBUFS; 1893 cl = kzalloc(sizeof(*cl), GFP_KERNEL); 1894 if (cl == NULL) 1895 goto failure; 1896 cl->R_tab = rtab; 1897 rtab = NULL; 1898 cl->refcnt = 1; 1899 if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid))) 1900 cl->q = &noop_qdisc; 1901 cl->classid = classid; 1902 cl->tparent = parent; 1903 cl->qdisc = sch; 1904 cl->allot = parent->allot; 1905 cl->quantum = cl->allot; 1906 cl->weight = cl->R_tab->rate.rate; 1907 1908 sch_tree_lock(sch); 1909 cbq_link_class(cl); 1910 cl->borrow = cl->tparent; 1911 if (cl->tparent != &q->link) 1912 cl->share = cl->tparent; 1913 cbq_adjust_levels(parent); 1914 cl->minidle = -0x7FFFFFFF; 1915 cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); 1916 cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); 1917 if (cl->ewma_log==0) 1918 cl->ewma_log = q->link.ewma_log; 1919 if (cl->maxidle==0) 1920 cl->maxidle = q->link.maxidle; 1921 if (cl->avpkt==0) 1922 cl->avpkt = q->link.avpkt; 1923 cl->overlimit = cbq_ovl_classic; 1924 if (tb[TCA_CBQ_OVL_STRATEGY-1]) 1925 cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); 1926#ifdef CONFIG_NET_CLS_ACT 1927 if (tb[TCA_CBQ_POLICE-1]) 1928 cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); 1929#endif 1930 if (tb[TCA_CBQ_FOPT-1]) 1931 cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); 1932 sch_tree_unlock(sch); 1933 1934 if (tca[TCA_RATE-1]) 1935 gen_new_estimator(&cl->bstats, &cl->rate_est, 1936 &sch->dev->queue_lock, tca[TCA_RATE-1]); 1937 1938 *arg = (unsigned long)cl; 1939 return 0; 1940 1941failure: 1942 qdisc_put_rtab(rtab); 1943 return err; 1944} 1945 1946static int cbq_delete(struct Qdisc *sch, unsigned long arg) 1947{ 1948 struct cbq_sched_data *q = qdisc_priv(sch); 1949 struct cbq_class *cl = (struct cbq_class*)arg; 1950 unsigned int qlen; 1951 1952 if (cl->filters || cl->children || cl == &q->link) 1953 return -EBUSY; 1954 1955 sch_tree_lock(sch); 1956 1957 qlen = cl->q->q.qlen; 1958 qdisc_reset(cl->q); 1959 qdisc_tree_decrease_qlen(cl->q, qlen); 1960 1961 if (cl->next_alive) 1962 cbq_deactivate_class(cl); 1963 1964 if (q->tx_borrowed == cl) 1965 q->tx_borrowed = q->tx_class; 1966 if (q->tx_class == cl) { 1967 q->tx_class = NULL; 1968 q->tx_borrowed = NULL; 1969 } 1970#ifdef CONFIG_NET_CLS_ACT 1971 if (q->rx_class == cl) 1972 q->rx_class = NULL; 1973#endif 1974 1975 cbq_unlink_class(cl); 1976 cbq_adjust_levels(cl->tparent); 1977 cl->defmap = 0; 1978 cbq_sync_defmap(cl); 1979 1980 cbq_rmprio(q, cl); 1981 sch_tree_unlock(sch); 1982 1983 if (--cl->refcnt == 0) 1984 cbq_destroy_class(sch, cl); 1985 1986 return 0; 1987} 1988 1989static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) 1990{ 1991 struct cbq_sched_data *q = qdisc_priv(sch); 1992 struct cbq_class *cl = (struct cbq_class *)arg; 1993 1994 if (cl == NULL) 1995 cl = &q->link; 1996 1997 return &cl->filter_list; 1998} 1999 2000static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, 2001 u32 classid) 2002{ 2003 struct cbq_sched_data *q = qdisc_priv(sch); 2004 struct cbq_class *p = (struct cbq_class*)parent; 2005 struct cbq_class *cl = cbq_class_lookup(q, classid); 2006 2007 if (cl) { 2008 if (p && p->level <= cl->level) 2009 return 0; 2010 cl->filters++; 2011 return (unsigned long)cl; 2012 } 2013 return 0; 2014} 2015 2016static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) 2017{ 2018 struct cbq_class *cl = (struct cbq_class*)arg; 2019 2020 cl->filters--; 2021} 2022 2023static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) 2024{ 2025 struct cbq_sched_data *q = qdisc_priv(sch); 2026 unsigned h; 2027 2028 if (arg->stop) 2029 return; 2030 2031 for (h = 0; h < 16; h++) { 2032 struct cbq_class *cl; 2033 2034 for (cl = q->classes[h]; cl; cl = cl->next) { 2035 if (arg->count < arg->skip) { 2036 arg->count++; 2037 continue; 2038 } 2039 if (arg->fn(sch, (unsigned long)cl, arg) < 0) { 2040 arg->stop = 1; 2041 return; 2042 } 2043 arg->count++; 2044 } 2045 } 2046} 2047 2048static struct Qdisc_class_ops cbq_class_ops = { 2049 .graft = cbq_graft, 2050 .leaf = cbq_leaf, 2051 .qlen_notify = cbq_qlen_notify, 2052 .get = cbq_get, 2053 .put = cbq_put, 2054 .change = cbq_change_class, 2055 .delete = cbq_delete, 2056 .walk = cbq_walk, 2057 .tcf_chain = cbq_find_tcf, 2058 .bind_tcf = cbq_bind_filter, 2059 .unbind_tcf = cbq_unbind_filter, 2060 .dump = cbq_dump_class, 2061 .dump_stats = cbq_dump_class_stats, 2062}; 2063 2064static struct Qdisc_ops cbq_qdisc_ops = { 2065 .next = NULL, 2066 .cl_ops = &cbq_class_ops, 2067 .id = "cbq", 2068 .priv_size = sizeof(struct cbq_sched_data), 2069 .enqueue = cbq_enqueue, 2070 .dequeue = cbq_dequeue, 2071 .requeue = cbq_requeue, 2072 .drop = cbq_drop, 2073 .init = cbq_init, 2074 .reset = cbq_reset, 2075 .destroy = cbq_destroy, 2076 .change = NULL, 2077 .dump = cbq_dump, 2078 .dump_stats = cbq_dump_stats, 2079 .owner = THIS_MODULE, 2080}; 2081 2082static int __init cbq_module_init(void) 2083{ 2084 return register_qdisc(&cbq_qdisc_ops); 2085} 2086static void __exit cbq_module_exit(void) 2087{ 2088 unregister_qdisc(&cbq_qdisc_ops); 2089} 2090module_init(cbq_module_init) 2091module_exit(cbq_module_exit) 2092MODULE_LICENSE("GPL");