net/sched/sch_cbq.c: Class-Based Queueing (CBQ) discipline, Linux kernel at v2.6.31-rc4
Source: Linux kernel mirror, git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	Algorithm skeleton was taken from NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	he should take into account that ONLY the skeleton was borrowed,
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note, that the statement of [3] is
	incomplete, delay may actually be estimated even if class
	per-round allotment is less than MTU. Namely, if per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me, why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. Workaround is to consider the next dequeue event
	as sign that previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with clock integrator, this scheme looks
	very close to an ideal solution.  */

struct cbq_sched_data;


struct cbq_class
{
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
	unsigned char		ovl_strategy;
#ifdef CONFIG_NET_CLS_ACT
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below.
						 */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* Overlimit strategy parameters */
	void			(*overlimit)(struct cbq_class *cl);
	psched_tdiff_t		penalty;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto	*filter_list;

	int			refcnt;
	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX+1];
};

struct cbq_sched_data
{
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO+1];
	unsigned		quanta[TC_CBQ_MAXPRIO+1];

	struct cbq_class	link;

	unsigned		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
	psched_time_t		now_rt;		/* Cached real time */
	unsigned		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};


#define L2T(cl,len)	qdisc_l2t((cl)->R_tab,len)

static __inline__ struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl, *new;

	for (cl = this->tparent; cl; cl = cl->tparent)
		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
			return new;

	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
   it allows us to combine link sharing and priority scheduling
   transparently.

   Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
   so that it resolves to split nodes. Then packets are classified
   by logical priority, or a more specific classifier may be attached
   to the split node.
 */

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio^sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;
		defmap = head->defaults;

		/*
		 * Step 2+n. Apply classifier.
		 */
		if (!head->filter_list ||
		    (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
			goto fallback;

		if ((cl = (void*)res.class) == NULL) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL || cl->level >= head->level)
				goto fallback;
		}

#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 * apply agency specific classifier.
		 * Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
   A packet has just been enqueued on the empty class.
   cbq_activate_class adds it to the tail of active class list
   of its priority band.
 */

static __inline__ void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
   Unlink class from active chain.
   Note that this same procedure is done directly in cbq_dequeue*
   during round-robin procedure.
319 */ 320 321static void cbq_deactivate_class(struct cbq_class *this) 322{ 323 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 324 int prio = this->cpriority; 325 struct cbq_class *cl; 326 struct cbq_class *cl_prev = q->active[prio]; 327 328 do { 329 cl = cl_prev->next_alive; 330 if (cl == this) { 331 cl_prev->next_alive = cl->next_alive; 332 cl->next_alive = NULL; 333 334 if (cl == q->active[prio]) { 335 q->active[prio] = cl_prev; 336 if (cl == q->active[prio]) { 337 q->active[prio] = NULL; 338 q->activemask &= ~(1<<prio); 339 return; 340 } 341 } 342 return; 343 } 344 } while ((cl_prev = cl) != q->active[prio]); 345} 346 347static void 348cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) 349{ 350 int toplevel = q->toplevel; 351 352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { 353 psched_time_t now; 354 psched_tdiff_t incr; 355 356 now = psched_get_time(); 357 incr = now - q->now_rt; 358 now = q->now + incr; 359 360 do { 361 if (cl->undertime < now) { 362 q->toplevel = cl->level; 363 return; 364 } 365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level); 366 } 367} 368 369static int 370cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 371{ 372 struct cbq_sched_data *q = qdisc_priv(sch); 373 int uninitialized_var(ret); 374 struct cbq_class *cl = cbq_classify(skb, sch, &ret); 375 376#ifdef CONFIG_NET_CLS_ACT 377 q->rx_class = cl; 378#endif 379 if (cl == NULL) { 380 if (ret & __NET_XMIT_BYPASS) 381 sch->qstats.drops++; 382 kfree_skb(skb); 383 return ret; 384 } 385 386#ifdef CONFIG_NET_CLS_ACT 387 cl->q->__parent = sch; 388#endif 389 ret = qdisc_enqueue(skb, cl->q); 390 if (ret == NET_XMIT_SUCCESS) { 391 sch->q.qlen++; 392 sch->bstats.packets++; 393 sch->bstats.bytes += qdisc_pkt_len(skb); 394 cbq_mark_toplevel(q, cl); 395 if (!cl->next_alive) 396 cbq_activate_class(cl); 397 return ret; 398 } 399 400 if (net_xmit_drop_count(ret)) { 401 sch->qstats.drops++; 402 cbq_mark_toplevel(q, cl); 403 cl->qstats.drops++; 404 } 405 return ret; 406} 407 408/* Overlimit actions */ 409 410/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ 411 412static void cbq_ovl_classic(struct cbq_class *cl) 413{ 414 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 415 psched_tdiff_t delay = cl->undertime - q->now; 416 417 if (!cl->delayed) { 418 delay += cl->offtime; 419 420 /* 421 Class goes to sleep, so that it will have no 422 chance to work avgidle. Let's forgive it 8) 423 424 BTW cbq-2.0 has a crap in this 425 place, apparently they forgot to shift it by cl->ewma_log. 426 */ 427 if (cl->avgidle < 0) 428 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 429 if (cl->avgidle < cl->minidle) 430 cl->avgidle = cl->minidle; 431 if (delay <= 0) 432 delay = 1; 433 cl->undertime = q->now + delay; 434 435 cl->xstats.overactions++; 436 cl->delayed = 1; 437 } 438 if (q->wd_expires == 0 || q->wd_expires > delay) 439 q->wd_expires = delay; 440 441 /* Dirty work! We must schedule wakeups based on 442 real available rate, rather than leaf rate, 443 which may be tiny (even zero). 
444 */ 445 if (q->toplevel == TC_CBQ_MAXLEVEL) { 446 struct cbq_class *b; 447 psched_tdiff_t base_delay = q->wd_expires; 448 449 for (b = cl->borrow; b; b = b->borrow) { 450 delay = b->undertime - q->now; 451 if (delay < base_delay) { 452 if (delay <= 0) 453 delay = 1; 454 base_delay = delay; 455 } 456 } 457 458 q->wd_expires = base_delay; 459 } 460} 461 462/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when 463 they go overlimit 464 */ 465 466static void cbq_ovl_rclassic(struct cbq_class *cl) 467{ 468 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 469 struct cbq_class *this = cl; 470 471 do { 472 if (cl->level > q->toplevel) { 473 cl = NULL; 474 break; 475 } 476 } while ((cl = cl->borrow) != NULL); 477 478 if (cl == NULL) 479 cl = this; 480 cbq_ovl_classic(cl); 481} 482 483/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ 484 485static void cbq_ovl_delay(struct cbq_class *cl) 486{ 487 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 488 psched_tdiff_t delay = cl->undertime - q->now; 489 490 if (test_bit(__QDISC_STATE_DEACTIVATED, 491 &qdisc_root_sleeping(cl->qdisc)->state)) 492 return; 493 494 if (!cl->delayed) { 495 psched_time_t sched = q->now; 496 ktime_t expires; 497 498 delay += cl->offtime; 499 if (cl->avgidle < 0) 500 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 501 if (cl->avgidle < cl->minidle) 502 cl->avgidle = cl->minidle; 503 cl->undertime = q->now + delay; 504 505 if (delay > 0) { 506 sched += delay + cl->penalty; 507 cl->penalized = sched; 508 cl->cpriority = TC_CBQ_MAXPRIO; 509 q->pmask |= (1<<TC_CBQ_MAXPRIO); 510 511 expires = ktime_set(0, 0); 512 expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); 513 if (hrtimer_try_to_cancel(&q->delay_timer) && 514 ktime_to_ns(ktime_sub( 515 hrtimer_get_expires(&q->delay_timer), 516 expires)) > 0) 517 hrtimer_set_expires(&q->delay_timer, expires); 518 hrtimer_restart(&q->delay_timer); 519 cl->delayed = 1; 520 cl->xstats.overactions++; 521 return; 522 } 523 delay = 1; 524 } 525 if (q->wd_expires == 0 || q->wd_expires > delay) 526 q->wd_expires = delay; 527} 528 529/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ 530 531static void cbq_ovl_lowprio(struct cbq_class *cl) 532{ 533 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 534 535 cl->penalized = q->now + cl->penalty; 536 537 if (cl->cpriority != cl->priority2) { 538 cl->cpriority = cl->priority2; 539 q->pmask |= (1<<cl->cpriority); 540 cl->xstats.overactions++; 541 } 542 cbq_ovl_classic(cl); 543} 544 545/* TC_CBQ_OVL_DROP: penalize class by dropping */ 546 547static void cbq_ovl_drop(struct cbq_class *cl) 548{ 549 if (cl->q->ops->drop) 550 if (cl->q->ops->drop(cl->q)) 551 cl->qdisc->q.qlen--; 552 cl->xstats.overactions++; 553 cbq_ovl_classic(cl); 554} 555 556static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, 557 psched_time_t now) 558{ 559 struct cbq_class *cl; 560 struct cbq_class *cl_prev = q->active[prio]; 561 psched_time_t sched = now; 562 563 if (cl_prev == NULL) 564 return 0; 565 566 do { 567 cl = cl_prev->next_alive; 568 if (now - cl->penalized > 0) { 569 cl_prev->next_alive = cl->next_alive; 570 cl->next_alive = NULL; 571 cl->cpriority = cl->priority; 572 cl->delayed = 0; 573 cbq_activate_class(cl); 574 575 if (cl == q->active[prio]) { 576 q->active[prio] = cl_prev; 577 if (cl == q->active[prio]) { 578 q->active[prio] = NULL; 579 return 0; 580 } 581 } 582 583 cl = cl_prev->next_alive; 584 } else if (sched - cl->penalized > 0) 585 sched = cl->penalized; 586 } while ((cl_prev = 
cl) != q->active[prio]); 587 588 return sched - now; 589} 590 591static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) 592{ 593 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, 594 delay_timer); 595 struct Qdisc *sch = q->watchdog.qdisc; 596 psched_time_t now; 597 psched_tdiff_t delay = 0; 598 unsigned pmask; 599 600 now = psched_get_time(); 601 602 pmask = q->pmask; 603 q->pmask = 0; 604 605 while (pmask) { 606 int prio = ffz(~pmask); 607 psched_tdiff_t tmp; 608 609 pmask &= ~(1<<prio); 610 611 tmp = cbq_undelay_prio(q, prio, now); 612 if (tmp > 0) { 613 q->pmask |= 1<<prio; 614 if (tmp < delay || delay == 0) 615 delay = tmp; 616 } 617 } 618 619 if (delay) { 620 ktime_t time; 621 622 time = ktime_set(0, 0); 623 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); 624 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); 625 } 626 627 sch->flags &= ~TCQ_F_THROTTLED; 628 __netif_schedule(qdisc_root(sch)); 629 return HRTIMER_NORESTART; 630} 631 632#ifdef CONFIG_NET_CLS_ACT 633static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) 634{ 635 struct Qdisc *sch = child->__parent; 636 struct cbq_sched_data *q = qdisc_priv(sch); 637 struct cbq_class *cl = q->rx_class; 638 639 q->rx_class = NULL; 640 641 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { 642 int ret; 643 644 cbq_mark_toplevel(q, cl); 645 646 q->rx_class = cl; 647 cl->q->__parent = sch; 648 649 ret = qdisc_enqueue(skb, cl->q); 650 if (ret == NET_XMIT_SUCCESS) { 651 sch->q.qlen++; 652 sch->bstats.packets++; 653 sch->bstats.bytes += qdisc_pkt_len(skb); 654 if (!cl->next_alive) 655 cbq_activate_class(cl); 656 return 0; 657 } 658 if (net_xmit_drop_count(ret)) 659 sch->qstats.drops++; 660 return 0; 661 } 662 663 sch->qstats.drops++; 664 return -1; 665} 666#endif 667 668/* 669 It is mission critical procedure. 670 671 We "regenerate" toplevel cutoff, if transmitting class 672 has backlog and it is not regulated. It is not part of 673 original CBQ description, but looks more reasonable. 674 Probably, it is wrong. This question needs further investigation. 675*/ 676 677static __inline__ void 678cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, 679 struct cbq_class *borrowed) 680{ 681 if (cl && q->toplevel >= borrowed->level) { 682 if (cl->q->q.qlen > 1) { 683 do { 684 if (borrowed->undertime == PSCHED_PASTPERFECT) { 685 q->toplevel = borrowed->level; 686 return; 687 } 688 } while ((borrowed=borrowed->borrow) != NULL); 689 } 690#if 0 691 /* It is not necessary now. Uncommenting it 692 will save CPU cycles, but decrease fairness. 693 */ 694 q->toplevel = TC_CBQ_MAXLEVEL; 695#endif 696 } 697} 698 699static void 700cbq_update(struct cbq_sched_data *q) 701{ 702 struct cbq_class *this = q->tx_class; 703 struct cbq_class *cl = this; 704 int len = q->tx_len; 705 706 q->tx_class = NULL; 707 708 for ( ; cl; cl = cl->share) { 709 long avgidle = cl->avgidle; 710 long idle; 711 712 cl->bstats.packets++; 713 cl->bstats.bytes += len; 714 715 /* 716 (now - last) is total time between packet right edges. 717 (last_pktlen/rate) is "virtual" busy time, so that 718 719 idle = (now - last) - last_pktlen/rate 720 */ 721 722 idle = q->now - cl->last; 723 if ((unsigned long)idle > 128*1024*1024) { 724 avgidle = cl->maxidle; 725 } else { 726 idle -= L2T(cl, len); 727 728 /* true_avgidle := (1-W)*true_avgidle + W*idle, 729 where W=2^{-ewma_log}. 
But cl->avgidle is scaled: 730 cl->avgidle == true_avgidle/W, 731 hence: 732 */ 733 avgidle += idle - (avgidle>>cl->ewma_log); 734 } 735 736 if (avgidle <= 0) { 737 /* Overlimit or at-limit */ 738 739 if (avgidle < cl->minidle) 740 avgidle = cl->minidle; 741 742 cl->avgidle = avgidle; 743 744 /* Calculate expected time, when this class 745 will be allowed to send. 746 It will occur, when: 747 (1-W)*true_avgidle + W*delay = 0, i.e. 748 idle = (1/W - 1)*(-true_avgidle) 749 or 750 idle = (1 - W)*(-cl->avgidle); 751 */ 752 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); 753 754 /* 755 That is not all. 756 To maintain the rate allocated to the class, 757 we add to undertime virtual clock, 758 necessary to complete transmitted packet. 759 (len/phys_bandwidth has been already passed 760 to the moment of cbq_update) 761 */ 762 763 idle -= L2T(&q->link, len); 764 idle += L2T(cl, len); 765 766 cl->undertime = q->now + idle; 767 } else { 768 /* Underlimit */ 769 770 cl->undertime = PSCHED_PASTPERFECT; 771 if (avgidle > cl->maxidle) 772 cl->avgidle = cl->maxidle; 773 else 774 cl->avgidle = avgidle; 775 } 776 cl->last = q->now; 777 } 778 779 cbq_update_toplevel(q, this, q->tx_borrowed); 780} 781 782static __inline__ struct cbq_class * 783cbq_under_limit(struct cbq_class *cl) 784{ 785 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 786 struct cbq_class *this_cl = cl; 787 788 if (cl->tparent == NULL) 789 return cl; 790 791 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { 792 cl->delayed = 0; 793 return cl; 794 } 795 796 do { 797 /* It is very suspicious place. Now overlimit 798 action is generated for not bounded classes 799 only if link is completely congested. 800 Though it is in agree with ancestor-only paradigm, 801 it looks very stupid. Particularly, 802 it means that this chunk of code will either 803 never be called or result in strong amplification 804 of burstiness. Dangerous, silly, and, however, 805 no another solution exists. 806 */ 807 if ((cl = cl->borrow) == NULL) { 808 this_cl->qstats.overlimits++; 809 this_cl->overlimit(this_cl); 810 return NULL; 811 } 812 if (cl->level > q->toplevel) 813 return NULL; 814 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); 815 816 cl->delayed = 0; 817 return cl; 818} 819 820static __inline__ struct sk_buff * 821cbq_dequeue_prio(struct Qdisc *sch, int prio) 822{ 823 struct cbq_sched_data *q = qdisc_priv(sch); 824 struct cbq_class *cl_tail, *cl_prev, *cl; 825 struct sk_buff *skb; 826 int deficit; 827 828 cl_tail = cl_prev = q->active[prio]; 829 cl = cl_prev->next_alive; 830 831 do { 832 deficit = 0; 833 834 /* Start round */ 835 do { 836 struct cbq_class *borrow = cl; 837 838 if (cl->q->q.qlen && 839 (borrow = cbq_under_limit(cl)) == NULL) 840 goto skip_class; 841 842 if (cl->deficit <= 0) { 843 /* Class exhausted its allotment per 844 this round. Switch to the next one. 845 */ 846 deficit = 1; 847 cl->deficit += cl->quantum; 848 goto next_class; 849 } 850 851 skb = cl->q->dequeue(cl->q); 852 853 /* Class did not give us any skb :-( 854 It could occur even if cl->q->q.qlen != 0 855 f.e. 
if cl->q == "tbf" 856 */ 857 if (skb == NULL) 858 goto skip_class; 859 860 cl->deficit -= qdisc_pkt_len(skb); 861 q->tx_class = cl; 862 q->tx_borrowed = borrow; 863 if (borrow != cl) { 864#ifndef CBQ_XSTATS_BORROWS_BYTES 865 borrow->xstats.borrows++; 866 cl->xstats.borrows++; 867#else 868 borrow->xstats.borrows += qdisc_pkt_len(skb); 869 cl->xstats.borrows += qdisc_pkt_len(skb); 870#endif 871 } 872 q->tx_len = qdisc_pkt_len(skb); 873 874 if (cl->deficit <= 0) { 875 q->active[prio] = cl; 876 cl = cl->next_alive; 877 cl->deficit += cl->quantum; 878 } 879 return skb; 880 881skip_class: 882 if (cl->q->q.qlen == 0 || prio != cl->cpriority) { 883 /* Class is empty or penalized. 884 Unlink it from active chain. 885 */ 886 cl_prev->next_alive = cl->next_alive; 887 cl->next_alive = NULL; 888 889 /* Did cl_tail point to it? */ 890 if (cl == cl_tail) { 891 /* Repair it! */ 892 cl_tail = cl_prev; 893 894 /* Was it the last class in this band? */ 895 if (cl == cl_tail) { 896 /* Kill the band! */ 897 q->active[prio] = NULL; 898 q->activemask &= ~(1<<prio); 899 if (cl->q->q.qlen) 900 cbq_activate_class(cl); 901 return NULL; 902 } 903 904 q->active[prio] = cl_tail; 905 } 906 if (cl->q->q.qlen) 907 cbq_activate_class(cl); 908 909 cl = cl_prev; 910 } 911 912next_class: 913 cl_prev = cl; 914 cl = cl->next_alive; 915 } while (cl_prev != cl_tail); 916 } while (deficit); 917 918 q->active[prio] = cl_prev; 919 920 return NULL; 921} 922 923static __inline__ struct sk_buff * 924cbq_dequeue_1(struct Qdisc *sch) 925{ 926 struct cbq_sched_data *q = qdisc_priv(sch); 927 struct sk_buff *skb; 928 unsigned activemask; 929 930 activemask = q->activemask&0xFF; 931 while (activemask) { 932 int prio = ffz(~activemask); 933 activemask &= ~(1<<prio); 934 skb = cbq_dequeue_prio(sch, prio); 935 if (skb) 936 return skb; 937 } 938 return NULL; 939} 940 941static struct sk_buff * 942cbq_dequeue(struct Qdisc *sch) 943{ 944 struct sk_buff *skb; 945 struct cbq_sched_data *q = qdisc_priv(sch); 946 psched_time_t now; 947 psched_tdiff_t incr; 948 949 now = psched_get_time(); 950 incr = now - q->now_rt; 951 952 if (q->tx_class) { 953 psched_tdiff_t incr2; 954 /* Time integrator. We calculate EOS time 955 by adding expected packet transmission time. 956 If real time is greater, we warp artificial clock, 957 so that: 958 959 cbq_time = max(real_time, work); 960 */ 961 incr2 = L2T(&q->link, q->tx_len); 962 q->now += incr2; 963 cbq_update(q); 964 if ((incr -= incr2) < 0) 965 incr = 0; 966 } 967 q->now += incr; 968 q->now_rt = now; 969 970 for (;;) { 971 q->wd_expires = 0; 972 973 skb = cbq_dequeue_1(sch); 974 if (skb) { 975 sch->q.qlen--; 976 sch->flags &= ~TCQ_F_THROTTLED; 977 return skb; 978 } 979 980 /* All the classes are overlimit. 981 982 It is possible, if: 983 984 1. Scheduler is empty. 985 2. Toplevel cutoff inhibited borrowing. 986 3. Root class is overlimit. 987 988 Reset 2d and 3d conditions and retry. 989 990 Note, that NS and cbq-2.0 are buggy, peeking 991 an arbitrary class is appropriate for ancestor-only 992 sharing, but not for toplevel algorithm. 993 994 Our version is better, but slower, because it requires 995 two passes, but it is unavoidable with top-level sharing. 996 */ 997 998 if (q->toplevel == TC_CBQ_MAXLEVEL && 999 q->link.undertime == PSCHED_PASTPERFECT) 1000 break; 1001 1002 q->toplevel = TC_CBQ_MAXLEVEL; 1003 q->link.undertime = PSCHED_PASTPERFECT; 1004 } 1005 1006 /* No packets in scheduler or nobody wants to give them to us :-( 1007 Sigh... start watchdog timer in the last case. 
*/ 1008 1009 if (sch->q.qlen) { 1010 sch->qstats.overlimits++; 1011 if (q->wd_expires) 1012 qdisc_watchdog_schedule(&q->watchdog, 1013 now + q->wd_expires); 1014 } 1015 return NULL; 1016} 1017 1018/* CBQ class maintanance routines */ 1019 1020static void cbq_adjust_levels(struct cbq_class *this) 1021{ 1022 if (this == NULL) 1023 return; 1024 1025 do { 1026 int level = 0; 1027 struct cbq_class *cl; 1028 1029 if ((cl = this->children) != NULL) { 1030 do { 1031 if (cl->level > level) 1032 level = cl->level; 1033 } while ((cl = cl->sibling) != this->children); 1034 } 1035 this->level = level+1; 1036 } while ((this = this->tparent) != NULL); 1037} 1038 1039static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) 1040{ 1041 struct cbq_class *cl; 1042 struct hlist_node *n; 1043 unsigned int h; 1044 1045 if (q->quanta[prio] == 0) 1046 return; 1047 1048 for (h = 0; h < q->clhash.hashsize; h++) { 1049 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 1050 /* BUGGGG... Beware! This expression suffer of 1051 arithmetic overflows! 1052 */ 1053 if (cl->priority == prio) { 1054 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ 1055 q->quanta[prio]; 1056 } 1057 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { 1058 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); 1059 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; 1060 } 1061 } 1062 } 1063} 1064 1065static void cbq_sync_defmap(struct cbq_class *cl) 1066{ 1067 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1068 struct cbq_class *split = cl->split; 1069 unsigned h; 1070 int i; 1071 1072 if (split == NULL) 1073 return; 1074 1075 for (i=0; i<=TC_PRIO_MAX; i++) { 1076 if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) 1077 split->defaults[i] = NULL; 1078 } 1079 1080 for (i=0; i<=TC_PRIO_MAX; i++) { 1081 int level = split->level; 1082 1083 if (split->defaults[i]) 1084 continue; 1085 1086 for (h = 0; h < q->clhash.hashsize; h++) { 1087 struct hlist_node *n; 1088 struct cbq_class *c; 1089 1090 hlist_for_each_entry(c, n, &q->clhash.hash[h], 1091 common.hnode) { 1092 if (c->split == split && c->level < level && 1093 c->defmap&(1<<i)) { 1094 split->defaults[i] = c; 1095 level = c->level; 1096 } 1097 } 1098 } 1099 } 1100} 1101 1102static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) 1103{ 1104 struct cbq_class *split = NULL; 1105 1106 if (splitid == 0) { 1107 if ((split = cl->split) == NULL) 1108 return; 1109 splitid = split->common.classid; 1110 } 1111 1112 if (split == NULL || split->common.classid != splitid) { 1113 for (split = cl->tparent; split; split = split->tparent) 1114 if (split->common.classid == splitid) 1115 break; 1116 } 1117 1118 if (split == NULL) 1119 return; 1120 1121 if (cl->split != split) { 1122 cl->defmap = 0; 1123 cbq_sync_defmap(cl); 1124 cl->split = split; 1125 cl->defmap = def&mask; 1126 } else 1127 cl->defmap = (cl->defmap&~mask)|(def&mask); 1128 1129 cbq_sync_defmap(cl); 1130} 1131 1132static void cbq_unlink_class(struct cbq_class *this) 1133{ 1134 struct cbq_class *cl, **clp; 1135 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1136 1137 qdisc_class_hash_remove(&q->clhash, &this->common); 1138 1139 if (this->tparent) { 1140 clp=&this->sibling; 1141 cl = *clp; 1142 do { 1143 if (cl == this) { 1144 *clp = cl->sibling; 1145 break; 1146 } 1147 clp = &cl->sibling; 1148 } while ((cl = *clp) != this->sibling); 1149 1150 if (this->tparent->children == this) { 1151 this->tparent->children = 
this->sibling; 1152 if (this->sibling == this) 1153 this->tparent->children = NULL; 1154 } 1155 } else { 1156 WARN_ON(this->sibling != this); 1157 } 1158} 1159 1160static void cbq_link_class(struct cbq_class *this) 1161{ 1162 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1163 struct cbq_class *parent = this->tparent; 1164 1165 this->sibling = this; 1166 qdisc_class_hash_insert(&q->clhash, &this->common); 1167 1168 if (parent == NULL) 1169 return; 1170 1171 if (parent->children == NULL) { 1172 parent->children = this; 1173 } else { 1174 this->sibling = parent->children->sibling; 1175 parent->children->sibling = this; 1176 } 1177} 1178 1179static unsigned int cbq_drop(struct Qdisc* sch) 1180{ 1181 struct cbq_sched_data *q = qdisc_priv(sch); 1182 struct cbq_class *cl, *cl_head; 1183 int prio; 1184 unsigned int len; 1185 1186 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { 1187 if ((cl_head = q->active[prio]) == NULL) 1188 continue; 1189 1190 cl = cl_head; 1191 do { 1192 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { 1193 sch->q.qlen--; 1194 if (!cl->q->q.qlen) 1195 cbq_deactivate_class(cl); 1196 return len; 1197 } 1198 } while ((cl = cl->next_alive) != cl_head); 1199 } 1200 return 0; 1201} 1202 1203static void 1204cbq_reset(struct Qdisc* sch) 1205{ 1206 struct cbq_sched_data *q = qdisc_priv(sch); 1207 struct cbq_class *cl; 1208 struct hlist_node *n; 1209 int prio; 1210 unsigned h; 1211 1212 q->activemask = 0; 1213 q->pmask = 0; 1214 q->tx_class = NULL; 1215 q->tx_borrowed = NULL; 1216 qdisc_watchdog_cancel(&q->watchdog); 1217 hrtimer_cancel(&q->delay_timer); 1218 q->toplevel = TC_CBQ_MAXLEVEL; 1219 q->now = psched_get_time(); 1220 q->now_rt = q->now; 1221 1222 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) 1223 q->active[prio] = NULL; 1224 1225 for (h = 0; h < q->clhash.hashsize; h++) { 1226 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 1227 qdisc_reset(cl->q); 1228 1229 cl->next_alive = NULL; 1230 cl->undertime = PSCHED_PASTPERFECT; 1231 cl->avgidle = cl->maxidle; 1232 cl->deficit = cl->quantum; 1233 cl->cpriority = cl->priority; 1234 } 1235 } 1236 sch->q.qlen = 0; 1237} 1238 1239 1240static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) 1241{ 1242 if (lss->change&TCF_CBQ_LSS_FLAGS) { 1243 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; 1244 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? 
NULL : cl->tparent; 1245 } 1246 if (lss->change&TCF_CBQ_LSS_EWMA) 1247 cl->ewma_log = lss->ewma_log; 1248 if (lss->change&TCF_CBQ_LSS_AVPKT) 1249 cl->avpkt = lss->avpkt; 1250 if (lss->change&TCF_CBQ_LSS_MINIDLE) 1251 cl->minidle = -(long)lss->minidle; 1252 if (lss->change&TCF_CBQ_LSS_MAXIDLE) { 1253 cl->maxidle = lss->maxidle; 1254 cl->avgidle = lss->maxidle; 1255 } 1256 if (lss->change&TCF_CBQ_LSS_OFFTIME) 1257 cl->offtime = lss->offtime; 1258 return 0; 1259} 1260 1261static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) 1262{ 1263 q->nclasses[cl->priority]--; 1264 q->quanta[cl->priority] -= cl->weight; 1265 cbq_normalize_quanta(q, cl->priority); 1266} 1267 1268static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) 1269{ 1270 q->nclasses[cl->priority]++; 1271 q->quanta[cl->priority] += cl->weight; 1272 cbq_normalize_quanta(q, cl->priority); 1273} 1274 1275static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) 1276{ 1277 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1278 1279 if (wrr->allot) 1280 cl->allot = wrr->allot; 1281 if (wrr->weight) 1282 cl->weight = wrr->weight; 1283 if (wrr->priority) { 1284 cl->priority = wrr->priority-1; 1285 cl->cpriority = cl->priority; 1286 if (cl->priority >= cl->priority2) 1287 cl->priority2 = TC_CBQ_MAXPRIO-1; 1288 } 1289 1290 cbq_addprio(q, cl); 1291 return 0; 1292} 1293 1294static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) 1295{ 1296 switch (ovl->strategy) { 1297 case TC_CBQ_OVL_CLASSIC: 1298 cl->overlimit = cbq_ovl_classic; 1299 break; 1300 case TC_CBQ_OVL_DELAY: 1301 cl->overlimit = cbq_ovl_delay; 1302 break; 1303 case TC_CBQ_OVL_LOWPRIO: 1304 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || 1305 ovl->priority2-1 <= cl->priority) 1306 return -EINVAL; 1307 cl->priority2 = ovl->priority2-1; 1308 cl->overlimit = cbq_ovl_lowprio; 1309 break; 1310 case TC_CBQ_OVL_DROP: 1311 cl->overlimit = cbq_ovl_drop; 1312 break; 1313 case TC_CBQ_OVL_RCLASSIC: 1314 cl->overlimit = cbq_ovl_rclassic; 1315 break; 1316 default: 1317 return -EINVAL; 1318 } 1319 cl->penalty = ovl->penalty; 1320 return 0; 1321} 1322 1323#ifdef CONFIG_NET_CLS_ACT 1324static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) 1325{ 1326 cl->police = p->police; 1327 1328 if (cl->q->handle) { 1329 if (p->police == TC_POLICE_RECLASSIFY) 1330 cl->q->reshape_fail = cbq_reshape_fail; 1331 else 1332 cl->q->reshape_fail = NULL; 1333 } 1334 return 0; 1335} 1336#endif 1337 1338static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) 1339{ 1340 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); 1341 return 0; 1342} 1343 1344static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { 1345 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, 1346 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, 1347 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, 1348 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, 1349 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, 1350 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, 1351 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, 1352}; 1353 1354static int cbq_init(struct Qdisc *sch, struct nlattr *opt) 1355{ 1356 struct cbq_sched_data *q = qdisc_priv(sch); 1357 struct nlattr *tb[TCA_CBQ_MAX + 1]; 1358 struct tc_ratespec *r; 1359 int err; 1360 1361 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); 1362 if (err < 0) 1363 return err; 1364 1365 if (tb[TCA_CBQ_RTAB] == NULL || 
tb[TCA_CBQ_RATE] == NULL) 1366 return -EINVAL; 1367 1368 r = nla_data(tb[TCA_CBQ_RATE]); 1369 1370 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) 1371 return -EINVAL; 1372 1373 err = qdisc_class_hash_init(&q->clhash); 1374 if (err < 0) 1375 goto put_rtab; 1376 1377 q->link.refcnt = 1; 1378 q->link.sibling = &q->link; 1379 q->link.common.classid = sch->handle; 1380 q->link.qdisc = sch; 1381 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1382 &pfifo_qdisc_ops, 1383 sch->handle))) 1384 q->link.q = &noop_qdisc; 1385 1386 q->link.priority = TC_CBQ_MAXPRIO-1; 1387 q->link.priority2 = TC_CBQ_MAXPRIO-1; 1388 q->link.cpriority = TC_CBQ_MAXPRIO-1; 1389 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; 1390 q->link.overlimit = cbq_ovl_classic; 1391 q->link.allot = psched_mtu(qdisc_dev(sch)); 1392 q->link.quantum = q->link.allot; 1393 q->link.weight = q->link.R_tab->rate.rate; 1394 1395 q->link.ewma_log = TC_CBQ_DEF_EWMA; 1396 q->link.avpkt = q->link.allot/2; 1397 q->link.minidle = -0x7FFFFFFF; 1398 1399 qdisc_watchdog_init(&q->watchdog, sch); 1400 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1401 q->delay_timer.function = cbq_undelay; 1402 q->toplevel = TC_CBQ_MAXLEVEL; 1403 q->now = psched_get_time(); 1404 q->now_rt = q->now; 1405 1406 cbq_link_class(&q->link); 1407 1408 if (tb[TCA_CBQ_LSSOPT]) 1409 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); 1410 1411 cbq_addprio(q, &q->link); 1412 return 0; 1413 1414put_rtab: 1415 qdisc_put_rtab(q->link.R_tab); 1416 return err; 1417} 1418 1419static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) 1420{ 1421 unsigned char *b = skb_tail_pointer(skb); 1422 1423 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); 1424 return skb->len; 1425 1426nla_put_failure: 1427 nlmsg_trim(skb, b); 1428 return -1; 1429} 1430 1431static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) 1432{ 1433 unsigned char *b = skb_tail_pointer(skb); 1434 struct tc_cbq_lssopt opt; 1435 1436 opt.flags = 0; 1437 if (cl->borrow == NULL) 1438 opt.flags |= TCF_CBQ_LSS_BOUNDED; 1439 if (cl->share == NULL) 1440 opt.flags |= TCF_CBQ_LSS_ISOLATED; 1441 opt.ewma_log = cl->ewma_log; 1442 opt.level = cl->level; 1443 opt.avpkt = cl->avpkt; 1444 opt.maxidle = cl->maxidle; 1445 opt.minidle = (u32)(-cl->minidle); 1446 opt.offtime = cl->offtime; 1447 opt.change = ~0; 1448 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); 1449 return skb->len; 1450 1451nla_put_failure: 1452 nlmsg_trim(skb, b); 1453 return -1; 1454} 1455 1456static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) 1457{ 1458 unsigned char *b = skb_tail_pointer(skb); 1459 struct tc_cbq_wrropt opt; 1460 1461 opt.flags = 0; 1462 opt.allot = cl->allot; 1463 opt.priority = cl->priority+1; 1464 opt.cpriority = cl->cpriority+1; 1465 opt.weight = cl->weight; 1466 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); 1467 return skb->len; 1468 1469nla_put_failure: 1470 nlmsg_trim(skb, b); 1471 return -1; 1472} 1473 1474static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) 1475{ 1476 unsigned char *b = skb_tail_pointer(skb); 1477 struct tc_cbq_ovl opt; 1478 1479 opt.strategy = cl->ovl_strategy; 1480 opt.priority2 = cl->priority2+1; 1481 opt.pad = 0; 1482 opt.penalty = cl->penalty; 1483 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1484 return skb->len; 1485 1486nla_put_failure: 1487 nlmsg_trim(skb, b); 1488 return -1; 1489} 1490 1491static __inline__ int cbq_dump_fopt(struct sk_buff 
*skb, struct cbq_class *cl) 1492{ 1493 unsigned char *b = skb_tail_pointer(skb); 1494 struct tc_cbq_fopt opt; 1495 1496 if (cl->split || cl->defmap) { 1497 opt.split = cl->split ? cl->split->common.classid : 0; 1498 opt.defmap = cl->defmap; 1499 opt.defchange = ~0; 1500 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); 1501 } 1502 return skb->len; 1503 1504nla_put_failure: 1505 nlmsg_trim(skb, b); 1506 return -1; 1507} 1508 1509#ifdef CONFIG_NET_CLS_ACT 1510static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) 1511{ 1512 unsigned char *b = skb_tail_pointer(skb); 1513 struct tc_cbq_police opt; 1514 1515 if (cl->police) { 1516 opt.police = cl->police; 1517 opt.__res1 = 0; 1518 opt.__res2 = 0; 1519 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1520 } 1521 return skb->len; 1522 1523nla_put_failure: 1524 nlmsg_trim(skb, b); 1525 return -1; 1526} 1527#endif 1528 1529static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) 1530{ 1531 if (cbq_dump_lss(skb, cl) < 0 || 1532 cbq_dump_rate(skb, cl) < 0 || 1533 cbq_dump_wrr(skb, cl) < 0 || 1534 cbq_dump_ovl(skb, cl) < 0 || 1535#ifdef CONFIG_NET_CLS_ACT 1536 cbq_dump_police(skb, cl) < 0 || 1537#endif 1538 cbq_dump_fopt(skb, cl) < 0) 1539 return -1; 1540 return 0; 1541} 1542 1543static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) 1544{ 1545 struct cbq_sched_data *q = qdisc_priv(sch); 1546 struct nlattr *nest; 1547 1548 nest = nla_nest_start(skb, TCA_OPTIONS); 1549 if (nest == NULL) 1550 goto nla_put_failure; 1551 if (cbq_dump_attr(skb, &q->link) < 0) 1552 goto nla_put_failure; 1553 nla_nest_end(skb, nest); 1554 return skb->len; 1555 1556nla_put_failure: 1557 nla_nest_cancel(skb, nest); 1558 return -1; 1559} 1560 1561static int 1562cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 1563{ 1564 struct cbq_sched_data *q = qdisc_priv(sch); 1565 1566 q->link.xstats.avgidle = q->link.avgidle; 1567 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); 1568} 1569 1570static int 1571cbq_dump_class(struct Qdisc *sch, unsigned long arg, 1572 struct sk_buff *skb, struct tcmsg *tcm) 1573{ 1574 struct cbq_class *cl = (struct cbq_class*)arg; 1575 struct nlattr *nest; 1576 1577 if (cl->tparent) 1578 tcm->tcm_parent = cl->tparent->common.classid; 1579 else 1580 tcm->tcm_parent = TC_H_ROOT; 1581 tcm->tcm_handle = cl->common.classid; 1582 tcm->tcm_info = cl->q->handle; 1583 1584 nest = nla_nest_start(skb, TCA_OPTIONS); 1585 if (nest == NULL) 1586 goto nla_put_failure; 1587 if (cbq_dump_attr(skb, cl) < 0) 1588 goto nla_put_failure; 1589 nla_nest_end(skb, nest); 1590 return skb->len; 1591 1592nla_put_failure: 1593 nla_nest_cancel(skb, nest); 1594 return -1; 1595} 1596 1597static int 1598cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, 1599 struct gnet_dump *d) 1600{ 1601 struct cbq_sched_data *q = qdisc_priv(sch); 1602 struct cbq_class *cl = (struct cbq_class*)arg; 1603 1604 cl->qstats.qlen = cl->q->q.qlen; 1605 cl->xstats.avgidle = cl->avgidle; 1606 cl->xstats.undertime = 0; 1607 1608 if (cl->undertime != PSCHED_PASTPERFECT) 1609 cl->xstats.undertime = cl->undertime - q->now; 1610 1611 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || 1612 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || 1613 gnet_stats_copy_queue(d, &cl->qstats) < 0) 1614 return -1; 1615 1616 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); 1617} 1618 1619static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1620 struct Qdisc **old) 1621{ 1622 struct cbq_class *cl = (struct cbq_class*)arg; 
1623 1624 if (cl) { 1625 if (new == NULL) { 1626 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1627 &pfifo_qdisc_ops, 1628 cl->common.classid); 1629 if (new == NULL) 1630 return -ENOBUFS; 1631 } else { 1632#ifdef CONFIG_NET_CLS_ACT 1633 if (cl->police == TC_POLICE_RECLASSIFY) 1634 new->reshape_fail = cbq_reshape_fail; 1635#endif 1636 } 1637 sch_tree_lock(sch); 1638 *old = cl->q; 1639 cl->q = new; 1640 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); 1641 qdisc_reset(*old); 1642 sch_tree_unlock(sch); 1643 1644 return 0; 1645 } 1646 return -ENOENT; 1647} 1648 1649static struct Qdisc * 1650cbq_leaf(struct Qdisc *sch, unsigned long arg) 1651{ 1652 struct cbq_class *cl = (struct cbq_class*)arg; 1653 1654 return cl ? cl->q : NULL; 1655} 1656 1657static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1658{ 1659 struct cbq_class *cl = (struct cbq_class *)arg; 1660 1661 if (cl->q->q.qlen == 0) 1662 cbq_deactivate_class(cl); 1663} 1664 1665static unsigned long cbq_get(struct Qdisc *sch, u32 classid) 1666{ 1667 struct cbq_sched_data *q = qdisc_priv(sch); 1668 struct cbq_class *cl = cbq_class_lookup(q, classid); 1669 1670 if (cl) { 1671 cl->refcnt++; 1672 return (unsigned long)cl; 1673 } 1674 return 0; 1675} 1676 1677static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) 1678{ 1679 struct cbq_sched_data *q = qdisc_priv(sch); 1680 1681 WARN_ON(cl->filters); 1682 1683 tcf_destroy_chain(&cl->filter_list); 1684 qdisc_destroy(cl->q); 1685 qdisc_put_rtab(cl->R_tab); 1686 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1687 if (cl != &q->link) 1688 kfree(cl); 1689} 1690 1691static void 1692cbq_destroy(struct Qdisc* sch) 1693{ 1694 struct cbq_sched_data *q = qdisc_priv(sch); 1695 struct hlist_node *n, *next; 1696 struct cbq_class *cl; 1697 unsigned h; 1698 1699#ifdef CONFIG_NET_CLS_ACT 1700 q->rx_class = NULL; 1701#endif 1702 /* 1703 * Filters must be destroyed first because we don't destroy the 1704 * classes from root to leafs which means that filters can still 1705 * be bound to classes which have been destroyed already. 
--TGR '04 1706 */ 1707 for (h = 0; h < q->clhash.hashsize; h++) { 1708 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) 1709 tcf_destroy_chain(&cl->filter_list); 1710 } 1711 for (h = 0; h < q->clhash.hashsize; h++) { 1712 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], 1713 common.hnode) 1714 cbq_destroy_class(sch, cl); 1715 } 1716 qdisc_class_hash_destroy(&q->clhash); 1717} 1718 1719static void cbq_put(struct Qdisc *sch, unsigned long arg) 1720{ 1721 struct cbq_class *cl = (struct cbq_class*)arg; 1722 1723 if (--cl->refcnt == 0) { 1724#ifdef CONFIG_NET_CLS_ACT 1725 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); 1726 struct cbq_sched_data *q = qdisc_priv(sch); 1727 1728 spin_lock_bh(root_lock); 1729 if (q->rx_class == cl) 1730 q->rx_class = NULL; 1731 spin_unlock_bh(root_lock); 1732#endif 1733 1734 cbq_destroy_class(sch, cl); 1735 } 1736} 1737 1738static int 1739cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, 1740 unsigned long *arg) 1741{ 1742 int err; 1743 struct cbq_sched_data *q = qdisc_priv(sch); 1744 struct cbq_class *cl = (struct cbq_class*)*arg; 1745 struct nlattr *opt = tca[TCA_OPTIONS]; 1746 struct nlattr *tb[TCA_CBQ_MAX + 1]; 1747 struct cbq_class *parent; 1748 struct qdisc_rate_table *rtab = NULL; 1749 1750 if (opt == NULL) 1751 return -EINVAL; 1752 1753 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); 1754 if (err < 0) 1755 return err; 1756 1757 if (cl) { 1758 /* Check parent */ 1759 if (parentid) { 1760 if (cl->tparent && 1761 cl->tparent->common.classid != parentid) 1762 return -EINVAL; 1763 if (!cl->tparent && parentid != TC_H_ROOT) 1764 return -EINVAL; 1765 } 1766 1767 if (tb[TCA_CBQ_RATE]) { 1768 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), 1769 tb[TCA_CBQ_RTAB]); 1770 if (rtab == NULL) 1771 return -EINVAL; 1772 } 1773 1774 if (tca[TCA_RATE]) { 1775 err = gen_replace_estimator(&cl->bstats, &cl->rate_est, 1776 qdisc_root_sleeping_lock(sch), 1777 tca[TCA_RATE]); 1778 if (err) { 1779 if (rtab) 1780 qdisc_put_rtab(rtab); 1781 return err; 1782 } 1783 } 1784 1785 /* Change class parameters */ 1786 sch_tree_lock(sch); 1787 1788 if (cl->next_alive != NULL) 1789 cbq_deactivate_class(cl); 1790 1791 if (rtab) { 1792 qdisc_put_rtab(cl->R_tab); 1793 cl->R_tab = rtab; 1794 } 1795 1796 if (tb[TCA_CBQ_LSSOPT]) 1797 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); 1798 1799 if (tb[TCA_CBQ_WRROPT]) { 1800 cbq_rmprio(q, cl); 1801 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); 1802 } 1803 1804 if (tb[TCA_CBQ_OVL_STRATEGY]) 1805 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); 1806 1807#ifdef CONFIG_NET_CLS_ACT 1808 if (tb[TCA_CBQ_POLICE]) 1809 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); 1810#endif 1811 1812 if (tb[TCA_CBQ_FOPT]) 1813 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); 1814 1815 if (cl->q->q.qlen) 1816 cbq_activate_class(cl); 1817 1818 sch_tree_unlock(sch); 1819 1820 return 0; 1821 } 1822 1823 if (parentid == TC_H_ROOT) 1824 return -EINVAL; 1825 1826 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || 1827 tb[TCA_CBQ_LSSOPT] == NULL) 1828 return -EINVAL; 1829 1830 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); 1831 if (rtab == NULL) 1832 return -EINVAL; 1833 1834 if (classid) { 1835 err = -EINVAL; 1836 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) 1837 goto failure; 1838 } else { 1839 int i; 1840 classid = TC_H_MAKE(sch->handle,0x8000); 1841 1842 for (i=0; i<0x8000; i++) { 1843 if (++q->hgenerator >= 0x8000) 1844 q->hgenerator = 1; 
1845 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) 1846 break; 1847 } 1848 err = -ENOSR; 1849 if (i >= 0x8000) 1850 goto failure; 1851 classid = classid|q->hgenerator; 1852 } 1853 1854 parent = &q->link; 1855 if (parentid) { 1856 parent = cbq_class_lookup(q, parentid); 1857 err = -EINVAL; 1858 if (parent == NULL) 1859 goto failure; 1860 } 1861 1862 err = -ENOBUFS; 1863 cl = kzalloc(sizeof(*cl), GFP_KERNEL); 1864 if (cl == NULL) 1865 goto failure; 1866 1867 if (tca[TCA_RATE]) { 1868 err = gen_new_estimator(&cl->bstats, &cl->rate_est, 1869 qdisc_root_sleeping_lock(sch), 1870 tca[TCA_RATE]); 1871 if (err) { 1872 kfree(cl); 1873 goto failure; 1874 } 1875 } 1876 1877 cl->R_tab = rtab; 1878 rtab = NULL; 1879 cl->refcnt = 1; 1880 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1881 &pfifo_qdisc_ops, classid))) 1882 cl->q = &noop_qdisc; 1883 cl->common.classid = classid; 1884 cl->tparent = parent; 1885 cl->qdisc = sch; 1886 cl->allot = parent->allot; 1887 cl->quantum = cl->allot; 1888 cl->weight = cl->R_tab->rate.rate; 1889 1890 sch_tree_lock(sch); 1891 cbq_link_class(cl); 1892 cl->borrow = cl->tparent; 1893 if (cl->tparent != &q->link) 1894 cl->share = cl->tparent; 1895 cbq_adjust_levels(parent); 1896 cl->minidle = -0x7FFFFFFF; 1897 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); 1898 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); 1899 if (cl->ewma_log==0) 1900 cl->ewma_log = q->link.ewma_log; 1901 if (cl->maxidle==0) 1902 cl->maxidle = q->link.maxidle; 1903 if (cl->avpkt==0) 1904 cl->avpkt = q->link.avpkt; 1905 cl->overlimit = cbq_ovl_classic; 1906 if (tb[TCA_CBQ_OVL_STRATEGY]) 1907 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); 1908#ifdef CONFIG_NET_CLS_ACT 1909 if (tb[TCA_CBQ_POLICE]) 1910 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); 1911#endif 1912 if (tb[TCA_CBQ_FOPT]) 1913 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); 1914 sch_tree_unlock(sch); 1915 1916 qdisc_class_hash_grow(sch, &q->clhash); 1917 1918 *arg = (unsigned long)cl; 1919 return 0; 1920 1921failure: 1922 qdisc_put_rtab(rtab); 1923 return err; 1924} 1925 1926static int cbq_delete(struct Qdisc *sch, unsigned long arg) 1927{ 1928 struct cbq_sched_data *q = qdisc_priv(sch); 1929 struct cbq_class *cl = (struct cbq_class*)arg; 1930 unsigned int qlen; 1931 1932 if (cl->filters || cl->children || cl == &q->link) 1933 return -EBUSY; 1934 1935 sch_tree_lock(sch); 1936 1937 qlen = cl->q->q.qlen; 1938 qdisc_reset(cl->q); 1939 qdisc_tree_decrease_qlen(cl->q, qlen); 1940 1941 if (cl->next_alive) 1942 cbq_deactivate_class(cl); 1943 1944 if (q->tx_borrowed == cl) 1945 q->tx_borrowed = q->tx_class; 1946 if (q->tx_class == cl) { 1947 q->tx_class = NULL; 1948 q->tx_borrowed = NULL; 1949 } 1950#ifdef CONFIG_NET_CLS_ACT 1951 if (q->rx_class == cl) 1952 q->rx_class = NULL; 1953#endif 1954 1955 cbq_unlink_class(cl); 1956 cbq_adjust_levels(cl->tparent); 1957 cl->defmap = 0; 1958 cbq_sync_defmap(cl); 1959 1960 cbq_rmprio(q, cl); 1961 sch_tree_unlock(sch); 1962 1963 BUG_ON(--cl->refcnt == 0); 1964 /* 1965 * This shouldn't happen: we "hold" one cops->get() when called 1966 * from tc_ctl_tclass; the destroy method is done from cops->put(). 
1967 */ 1968 1969 return 0; 1970} 1971 1972static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) 1973{ 1974 struct cbq_sched_data *q = qdisc_priv(sch); 1975 struct cbq_class *cl = (struct cbq_class *)arg; 1976 1977 if (cl == NULL) 1978 cl = &q->link; 1979 1980 return &cl->filter_list; 1981} 1982 1983static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, 1984 u32 classid) 1985{ 1986 struct cbq_sched_data *q = qdisc_priv(sch); 1987 struct cbq_class *p = (struct cbq_class*)parent; 1988 struct cbq_class *cl = cbq_class_lookup(q, classid); 1989 1990 if (cl) { 1991 if (p && p->level <= cl->level) 1992 return 0; 1993 cl->filters++; 1994 return (unsigned long)cl; 1995 } 1996 return 0; 1997} 1998 1999static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) 2000{ 2001 struct cbq_class *cl = (struct cbq_class*)arg; 2002 2003 cl->filters--; 2004} 2005 2006static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) 2007{ 2008 struct cbq_sched_data *q = qdisc_priv(sch); 2009 struct cbq_class *cl; 2010 struct hlist_node *n; 2011 unsigned h; 2012 2013 if (arg->stop) 2014 return; 2015 2016 for (h = 0; h < q->clhash.hashsize; h++) { 2017 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 2018 if (arg->count < arg->skip) { 2019 arg->count++; 2020 continue; 2021 } 2022 if (arg->fn(sch, (unsigned long)cl, arg) < 0) { 2023 arg->stop = 1; 2024 return; 2025 } 2026 arg->count++; 2027 } 2028 } 2029} 2030 2031static const struct Qdisc_class_ops cbq_class_ops = { 2032 .graft = cbq_graft, 2033 .leaf = cbq_leaf, 2034 .qlen_notify = cbq_qlen_notify, 2035 .get = cbq_get, 2036 .put = cbq_put, 2037 .change = cbq_change_class, 2038 .delete = cbq_delete, 2039 .walk = cbq_walk, 2040 .tcf_chain = cbq_find_tcf, 2041 .bind_tcf = cbq_bind_filter, 2042 .unbind_tcf = cbq_unbind_filter, 2043 .dump = cbq_dump_class, 2044 .dump_stats = cbq_dump_class_stats, 2045}; 2046 2047static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { 2048 .next = NULL, 2049 .cl_ops = &cbq_class_ops, 2050 .id = "cbq", 2051 .priv_size = sizeof(struct cbq_sched_data), 2052 .enqueue = cbq_enqueue, 2053 .dequeue = cbq_dequeue, 2054 .peek = qdisc_peek_dequeued, 2055 .drop = cbq_drop, 2056 .init = cbq_init, 2057 .reset = cbq_reset, 2058 .destroy = cbq_destroy, 2059 .change = NULL, 2060 .dump = cbq_dump, 2061 .dump_stats = cbq_dump_stats, 2062 .owner = THIS_MODULE, 2063}; 2064 2065static int __init cbq_module_init(void) 2066{ 2067 return register_qdisc(&cbq_qdisc_ops); 2068} 2069static void __exit cbq_module_exit(void) 2070{ 2071 unregister_qdisc(&cbq_qdisc_ops); 2072} 2073module_init(cbq_module_init) 2074module_exit(cbq_module_exit) 2075MODULE_LICENSE("GPL");