Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v2.6.31-rc8 2078 lines 49 kB view raw
1/* 2 * net/sched/sch_cbq.c Class-Based Queueing discipline. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 10 * 11 */ 12 13#include <linux/module.h> 14#include <linux/types.h> 15#include <linux/kernel.h> 16#include <linux/string.h> 17#include <linux/errno.h> 18#include <linux/skbuff.h> 19#include <net/netlink.h> 20#include <net/pkt_sched.h> 21 22 23/* Class-Based Queueing (CBQ) algorithm. 24 ======================================= 25 26 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource 27 Management Models for Packet Networks", 28 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 29 30 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 31 32 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting 33 Parameters", 1996 34 35 [4] Sally Floyd and Michael Speer, "Experimental Results 36 for Class-Based Queueing", 1998, not published. 37 38 ----------------------------------------------------------------------- 39 40 Algorithm skeleton was taken from NS simulator cbq.cc. 41 If someone wants to check this code against the LBL version, 42 he should take into account that ONLY the skeleton was borrowed, 43 the implementation is different. Particularly: 44 45 --- The WRR algorithm is different. Our version looks more 46 reasonable (I hope) and works when quanta are allowed to be 47 less than MTU, which is always the case when real time classes 48 have small rates. Note, that the statement of [3] is 49 incomplete, delay may actually be estimated even if class 50 per-round allotment is less than MTU. Namely, if per-round 51 allotment is W*r_i, and r_1+...+r_k = r < 1 52 53 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B 54 55 In the worst case we have IntServ estimate with D = W*r+k*MTU 56 and C = MTU*r. The proof (if correct at all) is trivial. 57 58 59 --- It seems that cbq-2.0 is not very accurate. At least, I cannot 60 interpret some places, which look like wrong translations 61 from NS. Anyone is advised to find these differences 62 and explain to me, why I am wrong 8). 63 64 --- Linux has no EOI event, so that we cannot estimate true class 65 idle time. Workaround is to consider the next dequeue event 66 as sign that previous packet is finished. This is wrong because of 67 internal device queueing, but on a permanently loaded link it is true. 68 Moreover, combined with clock integrator, this scheme looks 69 very close to an ideal solution. */ 70 71struct cbq_sched_data; 72 73 74struct cbq_class 75{ 76 struct Qdisc_class_common common; 77 struct cbq_class *next_alive; /* next class with backlog in this priority band */ 78 79/* Parameters */ 80 unsigned char priority; /* class priority */ 81 unsigned char priority2; /* priority to be used after overlimit */ 82 unsigned char ewma_log; /* time constant for idle time calculation */ 83 unsigned char ovl_strategy; 84#ifdef CONFIG_NET_CLS_ACT 85 unsigned char police; 86#endif 87 88 u32 defmap; 89 90 /* Link-sharing scheduler parameters */ 91 long maxidle; /* Class parameters: see below. */ 92 long offtime; 93 long minidle; 94 u32 avpkt; 95 struct qdisc_rate_table *R_tab; 96 97 /* Overlimit strategy parameters */ 98 void (*overlimit)(struct cbq_class *cl); 99 psched_tdiff_t penalty; 100 101 /* General scheduler (WRR) parameters */ 102 long allot; 103 long quantum; /* Allotment per WRR round */ 104 long weight; /* Relative allotment: see below */ 105 106 struct Qdisc *qdisc; /* Ptr to CBQ discipline */ 107 struct cbq_class *split; /* Ptr to split node */ 108 struct cbq_class *share; /* Ptr to LS parent in the class tree */ 109 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ 110 struct cbq_class *borrow; /* NULL if class is bandwidth limited; 111 parent otherwise */ 112 struct cbq_class *sibling; /* Sibling chain */ 113 struct cbq_class *children; /* Pointer to children chain */ 114 115 struct Qdisc *q; /* Elementary queueing discipline */ 116 117 118/* Variables */ 119 unsigned char cpriority; /* Effective priority */ 120 unsigned char delayed; 121 unsigned char level; /* level of the class in hierarchy: 122 0 for leaf classes, and maximal 123 level of children + 1 for nodes. 124 */ 125 126 psched_time_t last; /* Last end of service */ 127 psched_time_t undertime; 128 long avgidle; 129 long deficit; /* Saved deficit for WRR */ 130 psched_time_t penalized; 131 struct gnet_stats_basic_packed bstats; 132 struct gnet_stats_queue qstats; 133 struct gnet_stats_rate_est rate_est; 134 struct tc_cbq_xstats xstats; 135 136 struct tcf_proto *filter_list; 137 138 int refcnt; 139 int filters; 140 141 struct cbq_class *defaults[TC_PRIO_MAX+1]; 142}; 143 144struct cbq_sched_data 145{ 146 struct Qdisc_class_hash clhash; /* Hash table of all classes */ 147 int nclasses[TC_CBQ_MAXPRIO+1]; 148 unsigned quanta[TC_CBQ_MAXPRIO+1]; 149 150 struct cbq_class link; 151 152 unsigned activemask; 153 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes 154 with backlog */ 155 156#ifdef CONFIG_NET_CLS_ACT 157 struct cbq_class *rx_class; 158#endif 159 struct cbq_class *tx_class; 160 struct cbq_class *tx_borrowed; 161 int tx_len; 162 psched_time_t now; /* Cached timestamp */ 163 psched_time_t now_rt; /* Cached real time */ 164 unsigned pmask; 165 166 struct tasklet_hrtimer delay_timer; 167 struct qdisc_watchdog watchdog; /* Watchdog timer, 168 started when CBQ has 169 backlog, but cannot 170 transmit just now */ 171 psched_tdiff_t wd_expires; 172 int toplevel; 173 u32 hgenerator; 174}; 175 176 177#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len) 178 179static __inline__ struct cbq_class * 180cbq_class_lookup(struct cbq_sched_data *q, u32 classid) 181{ 182 struct Qdisc_class_common *clc; 183 184 clc = qdisc_class_find(&q->clhash, classid); 185 if (clc == NULL) 186 return NULL; 187 return container_of(clc, struct cbq_class, common); 188} 189 190#ifdef CONFIG_NET_CLS_ACT 191 192static struct cbq_class * 193cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) 194{ 195 struct cbq_class *cl, *new; 196 197 for (cl = this->tparent; cl; cl = cl->tparent) 198 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) 199 return new; 200 201 return NULL; 202} 203 204#endif 205 206/* Classify packet. The procedure is pretty complicated, but 207 it allows us to combine link sharing and priority scheduling 208 transparently. 209 210 Namely, you can put link sharing rules (f.e. route based) at root of CBQ, 211 so that it resolves to split nodes. Then packets are classified 212 by logical priority, or a more specific classifier may be attached 213 to the split node. 214 */ 215 216static struct cbq_class * 217cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) 218{ 219 struct cbq_sched_data *q = qdisc_priv(sch); 220 struct cbq_class *head = &q->link; 221 struct cbq_class **defmap; 222 struct cbq_class *cl = NULL; 223 u32 prio = skb->priority; 224 struct tcf_result res; 225 226 /* 227 * Step 1. If skb->priority points to one of our classes, use it. 228 */ 229 if (TC_H_MAJ(prio^sch->handle) == 0 && 230 (cl = cbq_class_lookup(q, prio)) != NULL) 231 return cl; 232 233 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 234 for (;;) { 235 int result = 0; 236 defmap = head->defaults; 237 238 /* 239 * Step 2+n. Apply classifier. 240 */ 241 if (!head->filter_list || 242 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) 243 goto fallback; 244 245 if ((cl = (void*)res.class) == NULL) { 246 if (TC_H_MAJ(res.classid)) 247 cl = cbq_class_lookup(q, res.classid); 248 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) 249 cl = defmap[TC_PRIO_BESTEFFORT]; 250 251 if (cl == NULL || cl->level >= head->level) 252 goto fallback; 253 } 254 255#ifdef CONFIG_NET_CLS_ACT 256 switch (result) { 257 case TC_ACT_QUEUED: 258 case TC_ACT_STOLEN: 259 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; 260 case TC_ACT_SHOT: 261 return NULL; 262 case TC_ACT_RECLASSIFY: 263 return cbq_reclassify(skb, cl); 264 } 265#endif 266 if (cl->level == 0) 267 return cl; 268 269 /* 270 * Step 3+n. If classifier selected a link sharing class, 271 * apply agency specific classifier. 272 * Repeat this procdure until we hit a leaf node. 273 */ 274 head = cl; 275 } 276 277fallback: 278 cl = head; 279 280 /* 281 * Step 4. No success... 282 */ 283 if (TC_H_MAJ(prio) == 0 && 284 !(cl = head->defaults[prio&TC_PRIO_MAX]) && 285 !(cl = head->defaults[TC_PRIO_BESTEFFORT])) 286 return head; 287 288 return cl; 289} 290 291/* 292 A packet has just been enqueued on the empty class. 293 cbq_activate_class adds it to the tail of active class list 294 of its priority band. 295 */ 296 297static __inline__ void cbq_activate_class(struct cbq_class *cl) 298{ 299 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 300 int prio = cl->cpriority; 301 struct cbq_class *cl_tail; 302 303 cl_tail = q->active[prio]; 304 q->active[prio] = cl; 305 306 if (cl_tail != NULL) { 307 cl->next_alive = cl_tail->next_alive; 308 cl_tail->next_alive = cl; 309 } else { 310 cl->next_alive = cl; 311 q->activemask |= (1<<prio); 312 } 313} 314 315/* 316 Unlink class from active chain. 317 Note that this same procedure is done directly in cbq_dequeue* 318 during round-robin procedure. 319 */ 320 321static void cbq_deactivate_class(struct cbq_class *this) 322{ 323 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 324 int prio = this->cpriority; 325 struct cbq_class *cl; 326 struct cbq_class *cl_prev = q->active[prio]; 327 328 do { 329 cl = cl_prev->next_alive; 330 if (cl == this) { 331 cl_prev->next_alive = cl->next_alive; 332 cl->next_alive = NULL; 333 334 if (cl == q->active[prio]) { 335 q->active[prio] = cl_prev; 336 if (cl == q->active[prio]) { 337 q->active[prio] = NULL; 338 q->activemask &= ~(1<<prio); 339 return; 340 } 341 } 342 return; 343 } 344 } while ((cl_prev = cl) != q->active[prio]); 345} 346 347static void 348cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) 349{ 350 int toplevel = q->toplevel; 351 352 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { 353 psched_time_t now; 354 psched_tdiff_t incr; 355 356 now = psched_get_time(); 357 incr = now - q->now_rt; 358 now = q->now + incr; 359 360 do { 361 if (cl->undertime < now) { 362 q->toplevel = cl->level; 363 return; 364 } 365 } while ((cl=cl->borrow) != NULL && toplevel > cl->level); 366 } 367} 368 369static int 370cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 371{ 372 struct cbq_sched_data *q = qdisc_priv(sch); 373 int uninitialized_var(ret); 374 struct cbq_class *cl = cbq_classify(skb, sch, &ret); 375 376#ifdef CONFIG_NET_CLS_ACT 377 q->rx_class = cl; 378#endif 379 if (cl == NULL) { 380 if (ret & __NET_XMIT_BYPASS) 381 sch->qstats.drops++; 382 kfree_skb(skb); 383 return ret; 384 } 385 386#ifdef CONFIG_NET_CLS_ACT 387 cl->q->__parent = sch; 388#endif 389 ret = qdisc_enqueue(skb, cl->q); 390 if (ret == NET_XMIT_SUCCESS) { 391 sch->q.qlen++; 392 sch->bstats.packets++; 393 sch->bstats.bytes += qdisc_pkt_len(skb); 394 cbq_mark_toplevel(q, cl); 395 if (!cl->next_alive) 396 cbq_activate_class(cl); 397 return ret; 398 } 399 400 if (net_xmit_drop_count(ret)) { 401 sch->qstats.drops++; 402 cbq_mark_toplevel(q, cl); 403 cl->qstats.drops++; 404 } 405 return ret; 406} 407 408/* Overlimit actions */ 409 410/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ 411 412static void cbq_ovl_classic(struct cbq_class *cl) 413{ 414 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 415 psched_tdiff_t delay = cl->undertime - q->now; 416 417 if (!cl->delayed) { 418 delay += cl->offtime; 419 420 /* 421 Class goes to sleep, so that it will have no 422 chance to work avgidle. Let's forgive it 8) 423 424 BTW cbq-2.0 has a crap in this 425 place, apparently they forgot to shift it by cl->ewma_log. 426 */ 427 if (cl->avgidle < 0) 428 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 429 if (cl->avgidle < cl->minidle) 430 cl->avgidle = cl->minidle; 431 if (delay <= 0) 432 delay = 1; 433 cl->undertime = q->now + delay; 434 435 cl->xstats.overactions++; 436 cl->delayed = 1; 437 } 438 if (q->wd_expires == 0 || q->wd_expires > delay) 439 q->wd_expires = delay; 440 441 /* Dirty work! We must schedule wakeups based on 442 real available rate, rather than leaf rate, 443 which may be tiny (even zero). 444 */ 445 if (q->toplevel == TC_CBQ_MAXLEVEL) { 446 struct cbq_class *b; 447 psched_tdiff_t base_delay = q->wd_expires; 448 449 for (b = cl->borrow; b; b = b->borrow) { 450 delay = b->undertime - q->now; 451 if (delay < base_delay) { 452 if (delay <= 0) 453 delay = 1; 454 base_delay = delay; 455 } 456 } 457 458 q->wd_expires = base_delay; 459 } 460} 461 462/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when 463 they go overlimit 464 */ 465 466static void cbq_ovl_rclassic(struct cbq_class *cl) 467{ 468 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 469 struct cbq_class *this = cl; 470 471 do { 472 if (cl->level > q->toplevel) { 473 cl = NULL; 474 break; 475 } 476 } while ((cl = cl->borrow) != NULL); 477 478 if (cl == NULL) 479 cl = this; 480 cbq_ovl_classic(cl); 481} 482 483/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ 484 485static void cbq_ovl_delay(struct cbq_class *cl) 486{ 487 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 488 psched_tdiff_t delay = cl->undertime - q->now; 489 490 if (test_bit(__QDISC_STATE_DEACTIVATED, 491 &qdisc_root_sleeping(cl->qdisc)->state)) 492 return; 493 494 if (!cl->delayed) { 495 psched_time_t sched = q->now; 496 ktime_t expires; 497 498 delay += cl->offtime; 499 if (cl->avgidle < 0) 500 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 501 if (cl->avgidle < cl->minidle) 502 cl->avgidle = cl->minidle; 503 cl->undertime = q->now + delay; 504 505 if (delay > 0) { 506 struct hrtimer *ht; 507 508 sched += delay + cl->penalty; 509 cl->penalized = sched; 510 cl->cpriority = TC_CBQ_MAXPRIO; 511 q->pmask |= (1<<TC_CBQ_MAXPRIO); 512 513 expires = ktime_set(0, 0); 514 expires = ktime_add_ns(expires, PSCHED_TICKS2NS(sched)); 515 ht = &q->delay_timer.timer; 516 if (hrtimer_try_to_cancel(ht) && 517 ktime_to_ns(ktime_sub(hrtimer_get_expires(ht), 518 expires)) > 0) 519 hrtimer_set_expires(ht, expires); 520 hrtimer_restart(ht); 521 cl->delayed = 1; 522 cl->xstats.overactions++; 523 return; 524 } 525 delay = 1; 526 } 527 if (q->wd_expires == 0 || q->wd_expires > delay) 528 q->wd_expires = delay; 529} 530 531/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ 532 533static void cbq_ovl_lowprio(struct cbq_class *cl) 534{ 535 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 536 537 cl->penalized = q->now + cl->penalty; 538 539 if (cl->cpriority != cl->priority2) { 540 cl->cpriority = cl->priority2; 541 q->pmask |= (1<<cl->cpriority); 542 cl->xstats.overactions++; 543 } 544 cbq_ovl_classic(cl); 545} 546 547/* TC_CBQ_OVL_DROP: penalize class by dropping */ 548 549static void cbq_ovl_drop(struct cbq_class *cl) 550{ 551 if (cl->q->ops->drop) 552 if (cl->q->ops->drop(cl->q)) 553 cl->qdisc->q.qlen--; 554 cl->xstats.overactions++; 555 cbq_ovl_classic(cl); 556} 557 558static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio, 559 psched_time_t now) 560{ 561 struct cbq_class *cl; 562 struct cbq_class *cl_prev = q->active[prio]; 563 psched_time_t sched = now; 564 565 if (cl_prev == NULL) 566 return 0; 567 568 do { 569 cl = cl_prev->next_alive; 570 if (now - cl->penalized > 0) { 571 cl_prev->next_alive = cl->next_alive; 572 cl->next_alive = NULL; 573 cl->cpriority = cl->priority; 574 cl->delayed = 0; 575 cbq_activate_class(cl); 576 577 if (cl == q->active[prio]) { 578 q->active[prio] = cl_prev; 579 if (cl == q->active[prio]) { 580 q->active[prio] = NULL; 581 return 0; 582 } 583 } 584 585 cl = cl_prev->next_alive; 586 } else if (sched - cl->penalized > 0) 587 sched = cl->penalized; 588 } while ((cl_prev = cl) != q->active[prio]); 589 590 return sched - now; 591} 592 593static enum hrtimer_restart cbq_undelay(struct hrtimer *timer) 594{ 595 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data, 596 delay_timer.timer); 597 struct Qdisc *sch = q->watchdog.qdisc; 598 psched_time_t now; 599 psched_tdiff_t delay = 0; 600 unsigned pmask; 601 602 now = psched_get_time(); 603 604 pmask = q->pmask; 605 q->pmask = 0; 606 607 while (pmask) { 608 int prio = ffz(~pmask); 609 psched_tdiff_t tmp; 610 611 pmask &= ~(1<<prio); 612 613 tmp = cbq_undelay_prio(q, prio, now); 614 if (tmp > 0) { 615 q->pmask |= 1<<prio; 616 if (tmp < delay || delay == 0) 617 delay = tmp; 618 } 619 } 620 621 if (delay) { 622 ktime_t time; 623 624 time = ktime_set(0, 0); 625 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay)); 626 tasklet_hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); 627 } 628 629 sch->flags &= ~TCQ_F_THROTTLED; 630 __netif_schedule(qdisc_root(sch)); 631 return HRTIMER_NORESTART; 632} 633 634#ifdef CONFIG_NET_CLS_ACT 635static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) 636{ 637 struct Qdisc *sch = child->__parent; 638 struct cbq_sched_data *q = qdisc_priv(sch); 639 struct cbq_class *cl = q->rx_class; 640 641 q->rx_class = NULL; 642 643 if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { 644 int ret; 645 646 cbq_mark_toplevel(q, cl); 647 648 q->rx_class = cl; 649 cl->q->__parent = sch; 650 651 ret = qdisc_enqueue(skb, cl->q); 652 if (ret == NET_XMIT_SUCCESS) { 653 sch->q.qlen++; 654 sch->bstats.packets++; 655 sch->bstats.bytes += qdisc_pkt_len(skb); 656 if (!cl->next_alive) 657 cbq_activate_class(cl); 658 return 0; 659 } 660 if (net_xmit_drop_count(ret)) 661 sch->qstats.drops++; 662 return 0; 663 } 664 665 sch->qstats.drops++; 666 return -1; 667} 668#endif 669 670/* 671 It is mission critical procedure. 672 673 We "regenerate" toplevel cutoff, if transmitting class 674 has backlog and it is not regulated. It is not part of 675 original CBQ description, but looks more reasonable. 676 Probably, it is wrong. This question needs further investigation. 677*/ 678 679static __inline__ void 680cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, 681 struct cbq_class *borrowed) 682{ 683 if (cl && q->toplevel >= borrowed->level) { 684 if (cl->q->q.qlen > 1) { 685 do { 686 if (borrowed->undertime == PSCHED_PASTPERFECT) { 687 q->toplevel = borrowed->level; 688 return; 689 } 690 } while ((borrowed=borrowed->borrow) != NULL); 691 } 692#if 0 693 /* It is not necessary now. Uncommenting it 694 will save CPU cycles, but decrease fairness. 695 */ 696 q->toplevel = TC_CBQ_MAXLEVEL; 697#endif 698 } 699} 700 701static void 702cbq_update(struct cbq_sched_data *q) 703{ 704 struct cbq_class *this = q->tx_class; 705 struct cbq_class *cl = this; 706 int len = q->tx_len; 707 708 q->tx_class = NULL; 709 710 for ( ; cl; cl = cl->share) { 711 long avgidle = cl->avgidle; 712 long idle; 713 714 cl->bstats.packets++; 715 cl->bstats.bytes += len; 716 717 /* 718 (now - last) is total time between packet right edges. 719 (last_pktlen/rate) is "virtual" busy time, so that 720 721 idle = (now - last) - last_pktlen/rate 722 */ 723 724 idle = q->now - cl->last; 725 if ((unsigned long)idle > 128*1024*1024) { 726 avgidle = cl->maxidle; 727 } else { 728 idle -= L2T(cl, len); 729 730 /* true_avgidle := (1-W)*true_avgidle + W*idle, 731 where W=2^{-ewma_log}. But cl->avgidle is scaled: 732 cl->avgidle == true_avgidle/W, 733 hence: 734 */ 735 avgidle += idle - (avgidle>>cl->ewma_log); 736 } 737 738 if (avgidle <= 0) { 739 /* Overlimit or at-limit */ 740 741 if (avgidle < cl->minidle) 742 avgidle = cl->minidle; 743 744 cl->avgidle = avgidle; 745 746 /* Calculate expected time, when this class 747 will be allowed to send. 748 It will occur, when: 749 (1-W)*true_avgidle + W*delay = 0, i.e. 750 idle = (1/W - 1)*(-true_avgidle) 751 or 752 idle = (1 - W)*(-cl->avgidle); 753 */ 754 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); 755 756 /* 757 That is not all. 758 To maintain the rate allocated to the class, 759 we add to undertime virtual clock, 760 necessary to complete transmitted packet. 761 (len/phys_bandwidth has been already passed 762 to the moment of cbq_update) 763 */ 764 765 idle -= L2T(&q->link, len); 766 idle += L2T(cl, len); 767 768 cl->undertime = q->now + idle; 769 } else { 770 /* Underlimit */ 771 772 cl->undertime = PSCHED_PASTPERFECT; 773 if (avgidle > cl->maxidle) 774 cl->avgidle = cl->maxidle; 775 else 776 cl->avgidle = avgidle; 777 } 778 cl->last = q->now; 779 } 780 781 cbq_update_toplevel(q, this, q->tx_borrowed); 782} 783 784static __inline__ struct cbq_class * 785cbq_under_limit(struct cbq_class *cl) 786{ 787 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 788 struct cbq_class *this_cl = cl; 789 790 if (cl->tparent == NULL) 791 return cl; 792 793 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) { 794 cl->delayed = 0; 795 return cl; 796 } 797 798 do { 799 /* It is very suspicious place. Now overlimit 800 action is generated for not bounded classes 801 only if link is completely congested. 802 Though it is in agree with ancestor-only paradigm, 803 it looks very stupid. Particularly, 804 it means that this chunk of code will either 805 never be called or result in strong amplification 806 of burstiness. Dangerous, silly, and, however, 807 no another solution exists. 808 */ 809 if ((cl = cl->borrow) == NULL) { 810 this_cl->qstats.overlimits++; 811 this_cl->overlimit(this_cl); 812 return NULL; 813 } 814 if (cl->level > q->toplevel) 815 return NULL; 816 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime); 817 818 cl->delayed = 0; 819 return cl; 820} 821 822static __inline__ struct sk_buff * 823cbq_dequeue_prio(struct Qdisc *sch, int prio) 824{ 825 struct cbq_sched_data *q = qdisc_priv(sch); 826 struct cbq_class *cl_tail, *cl_prev, *cl; 827 struct sk_buff *skb; 828 int deficit; 829 830 cl_tail = cl_prev = q->active[prio]; 831 cl = cl_prev->next_alive; 832 833 do { 834 deficit = 0; 835 836 /* Start round */ 837 do { 838 struct cbq_class *borrow = cl; 839 840 if (cl->q->q.qlen && 841 (borrow = cbq_under_limit(cl)) == NULL) 842 goto skip_class; 843 844 if (cl->deficit <= 0) { 845 /* Class exhausted its allotment per 846 this round. Switch to the next one. 847 */ 848 deficit = 1; 849 cl->deficit += cl->quantum; 850 goto next_class; 851 } 852 853 skb = cl->q->dequeue(cl->q); 854 855 /* Class did not give us any skb :-( 856 It could occur even if cl->q->q.qlen != 0 857 f.e. if cl->q == "tbf" 858 */ 859 if (skb == NULL) 860 goto skip_class; 861 862 cl->deficit -= qdisc_pkt_len(skb); 863 q->tx_class = cl; 864 q->tx_borrowed = borrow; 865 if (borrow != cl) { 866#ifndef CBQ_XSTATS_BORROWS_BYTES 867 borrow->xstats.borrows++; 868 cl->xstats.borrows++; 869#else 870 borrow->xstats.borrows += qdisc_pkt_len(skb); 871 cl->xstats.borrows += qdisc_pkt_len(skb); 872#endif 873 } 874 q->tx_len = qdisc_pkt_len(skb); 875 876 if (cl->deficit <= 0) { 877 q->active[prio] = cl; 878 cl = cl->next_alive; 879 cl->deficit += cl->quantum; 880 } 881 return skb; 882 883skip_class: 884 if (cl->q->q.qlen == 0 || prio != cl->cpriority) { 885 /* Class is empty or penalized. 886 Unlink it from active chain. 887 */ 888 cl_prev->next_alive = cl->next_alive; 889 cl->next_alive = NULL; 890 891 /* Did cl_tail point to it? */ 892 if (cl == cl_tail) { 893 /* Repair it! */ 894 cl_tail = cl_prev; 895 896 /* Was it the last class in this band? */ 897 if (cl == cl_tail) { 898 /* Kill the band! */ 899 q->active[prio] = NULL; 900 q->activemask &= ~(1<<prio); 901 if (cl->q->q.qlen) 902 cbq_activate_class(cl); 903 return NULL; 904 } 905 906 q->active[prio] = cl_tail; 907 } 908 if (cl->q->q.qlen) 909 cbq_activate_class(cl); 910 911 cl = cl_prev; 912 } 913 914next_class: 915 cl_prev = cl; 916 cl = cl->next_alive; 917 } while (cl_prev != cl_tail); 918 } while (deficit); 919 920 q->active[prio] = cl_prev; 921 922 return NULL; 923} 924 925static __inline__ struct sk_buff * 926cbq_dequeue_1(struct Qdisc *sch) 927{ 928 struct cbq_sched_data *q = qdisc_priv(sch); 929 struct sk_buff *skb; 930 unsigned activemask; 931 932 activemask = q->activemask&0xFF; 933 while (activemask) { 934 int prio = ffz(~activemask); 935 activemask &= ~(1<<prio); 936 skb = cbq_dequeue_prio(sch, prio); 937 if (skb) 938 return skb; 939 } 940 return NULL; 941} 942 943static struct sk_buff * 944cbq_dequeue(struct Qdisc *sch) 945{ 946 struct sk_buff *skb; 947 struct cbq_sched_data *q = qdisc_priv(sch); 948 psched_time_t now; 949 psched_tdiff_t incr; 950 951 now = psched_get_time(); 952 incr = now - q->now_rt; 953 954 if (q->tx_class) { 955 psched_tdiff_t incr2; 956 /* Time integrator. We calculate EOS time 957 by adding expected packet transmission time. 958 If real time is greater, we warp artificial clock, 959 so that: 960 961 cbq_time = max(real_time, work); 962 */ 963 incr2 = L2T(&q->link, q->tx_len); 964 q->now += incr2; 965 cbq_update(q); 966 if ((incr -= incr2) < 0) 967 incr = 0; 968 } 969 q->now += incr; 970 q->now_rt = now; 971 972 for (;;) { 973 q->wd_expires = 0; 974 975 skb = cbq_dequeue_1(sch); 976 if (skb) { 977 sch->q.qlen--; 978 sch->flags &= ~TCQ_F_THROTTLED; 979 return skb; 980 } 981 982 /* All the classes are overlimit. 983 984 It is possible, if: 985 986 1. Scheduler is empty. 987 2. Toplevel cutoff inhibited borrowing. 988 3. Root class is overlimit. 989 990 Reset 2d and 3d conditions and retry. 991 992 Note, that NS and cbq-2.0 are buggy, peeking 993 an arbitrary class is appropriate for ancestor-only 994 sharing, but not for toplevel algorithm. 995 996 Our version is better, but slower, because it requires 997 two passes, but it is unavoidable with top-level sharing. 998 */ 999 1000 if (q->toplevel == TC_CBQ_MAXLEVEL && 1001 q->link.undertime == PSCHED_PASTPERFECT) 1002 break; 1003 1004 q->toplevel = TC_CBQ_MAXLEVEL; 1005 q->link.undertime = PSCHED_PASTPERFECT; 1006 } 1007 1008 /* No packets in scheduler or nobody wants to give them to us :-( 1009 Sigh... start watchdog timer in the last case. */ 1010 1011 if (sch->q.qlen) { 1012 sch->qstats.overlimits++; 1013 if (q->wd_expires) 1014 qdisc_watchdog_schedule(&q->watchdog, 1015 now + q->wd_expires); 1016 } 1017 return NULL; 1018} 1019 1020/* CBQ class maintanance routines */ 1021 1022static void cbq_adjust_levels(struct cbq_class *this) 1023{ 1024 if (this == NULL) 1025 return; 1026 1027 do { 1028 int level = 0; 1029 struct cbq_class *cl; 1030 1031 if ((cl = this->children) != NULL) { 1032 do { 1033 if (cl->level > level) 1034 level = cl->level; 1035 } while ((cl = cl->sibling) != this->children); 1036 } 1037 this->level = level+1; 1038 } while ((this = this->tparent) != NULL); 1039} 1040 1041static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) 1042{ 1043 struct cbq_class *cl; 1044 struct hlist_node *n; 1045 unsigned int h; 1046 1047 if (q->quanta[prio] == 0) 1048 return; 1049 1050 for (h = 0; h < q->clhash.hashsize; h++) { 1051 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 1052 /* BUGGGG... Beware! This expression suffer of 1053 arithmetic overflows! 1054 */ 1055 if (cl->priority == prio) { 1056 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ 1057 q->quanta[prio]; 1058 } 1059 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { 1060 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); 1061 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; 1062 } 1063 } 1064 } 1065} 1066 1067static void cbq_sync_defmap(struct cbq_class *cl) 1068{ 1069 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1070 struct cbq_class *split = cl->split; 1071 unsigned h; 1072 int i; 1073 1074 if (split == NULL) 1075 return; 1076 1077 for (i=0; i<=TC_PRIO_MAX; i++) { 1078 if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) 1079 split->defaults[i] = NULL; 1080 } 1081 1082 for (i=0; i<=TC_PRIO_MAX; i++) { 1083 int level = split->level; 1084 1085 if (split->defaults[i]) 1086 continue; 1087 1088 for (h = 0; h < q->clhash.hashsize; h++) { 1089 struct hlist_node *n; 1090 struct cbq_class *c; 1091 1092 hlist_for_each_entry(c, n, &q->clhash.hash[h], 1093 common.hnode) { 1094 if (c->split == split && c->level < level && 1095 c->defmap&(1<<i)) { 1096 split->defaults[i] = c; 1097 level = c->level; 1098 } 1099 } 1100 } 1101 } 1102} 1103 1104static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) 1105{ 1106 struct cbq_class *split = NULL; 1107 1108 if (splitid == 0) { 1109 if ((split = cl->split) == NULL) 1110 return; 1111 splitid = split->common.classid; 1112 } 1113 1114 if (split == NULL || split->common.classid != splitid) { 1115 for (split = cl->tparent; split; split = split->tparent) 1116 if (split->common.classid == splitid) 1117 break; 1118 } 1119 1120 if (split == NULL) 1121 return; 1122 1123 if (cl->split != split) { 1124 cl->defmap = 0; 1125 cbq_sync_defmap(cl); 1126 cl->split = split; 1127 cl->defmap = def&mask; 1128 } else 1129 cl->defmap = (cl->defmap&~mask)|(def&mask); 1130 1131 cbq_sync_defmap(cl); 1132} 1133 1134static void cbq_unlink_class(struct cbq_class *this) 1135{ 1136 struct cbq_class *cl, **clp; 1137 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1138 1139 qdisc_class_hash_remove(&q->clhash, &this->common); 1140 1141 if (this->tparent) { 1142 clp=&this->sibling; 1143 cl = *clp; 1144 do { 1145 if (cl == this) { 1146 *clp = cl->sibling; 1147 break; 1148 } 1149 clp = &cl->sibling; 1150 } while ((cl = *clp) != this->sibling); 1151 1152 if (this->tparent->children == this) { 1153 this->tparent->children = this->sibling; 1154 if (this->sibling == this) 1155 this->tparent->children = NULL; 1156 } 1157 } else { 1158 WARN_ON(this->sibling != this); 1159 } 1160} 1161 1162static void cbq_link_class(struct cbq_class *this) 1163{ 1164 struct cbq_sched_data *q = qdisc_priv(this->qdisc); 1165 struct cbq_class *parent = this->tparent; 1166 1167 this->sibling = this; 1168 qdisc_class_hash_insert(&q->clhash, &this->common); 1169 1170 if (parent == NULL) 1171 return; 1172 1173 if (parent->children == NULL) { 1174 parent->children = this; 1175 } else { 1176 this->sibling = parent->children->sibling; 1177 parent->children->sibling = this; 1178 } 1179} 1180 1181static unsigned int cbq_drop(struct Qdisc* sch) 1182{ 1183 struct cbq_sched_data *q = qdisc_priv(sch); 1184 struct cbq_class *cl, *cl_head; 1185 int prio; 1186 unsigned int len; 1187 1188 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { 1189 if ((cl_head = q->active[prio]) == NULL) 1190 continue; 1191 1192 cl = cl_head; 1193 do { 1194 if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { 1195 sch->q.qlen--; 1196 if (!cl->q->q.qlen) 1197 cbq_deactivate_class(cl); 1198 return len; 1199 } 1200 } while ((cl = cl->next_alive) != cl_head); 1201 } 1202 return 0; 1203} 1204 1205static void 1206cbq_reset(struct Qdisc* sch) 1207{ 1208 struct cbq_sched_data *q = qdisc_priv(sch); 1209 struct cbq_class *cl; 1210 struct hlist_node *n; 1211 int prio; 1212 unsigned h; 1213 1214 q->activemask = 0; 1215 q->pmask = 0; 1216 q->tx_class = NULL; 1217 q->tx_borrowed = NULL; 1218 qdisc_watchdog_cancel(&q->watchdog); 1219 tasklet_hrtimer_cancel(&q->delay_timer); 1220 q->toplevel = TC_CBQ_MAXLEVEL; 1221 q->now = psched_get_time(); 1222 q->now_rt = q->now; 1223 1224 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) 1225 q->active[prio] = NULL; 1226 1227 for (h = 0; h < q->clhash.hashsize; h++) { 1228 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 1229 qdisc_reset(cl->q); 1230 1231 cl->next_alive = NULL; 1232 cl->undertime = PSCHED_PASTPERFECT; 1233 cl->avgidle = cl->maxidle; 1234 cl->deficit = cl->quantum; 1235 cl->cpriority = cl->priority; 1236 } 1237 } 1238 sch->q.qlen = 0; 1239} 1240 1241 1242static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) 1243{ 1244 if (lss->change&TCF_CBQ_LSS_FLAGS) { 1245 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; 1246 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; 1247 } 1248 if (lss->change&TCF_CBQ_LSS_EWMA) 1249 cl->ewma_log = lss->ewma_log; 1250 if (lss->change&TCF_CBQ_LSS_AVPKT) 1251 cl->avpkt = lss->avpkt; 1252 if (lss->change&TCF_CBQ_LSS_MINIDLE) 1253 cl->minidle = -(long)lss->minidle; 1254 if (lss->change&TCF_CBQ_LSS_MAXIDLE) { 1255 cl->maxidle = lss->maxidle; 1256 cl->avgidle = lss->maxidle; 1257 } 1258 if (lss->change&TCF_CBQ_LSS_OFFTIME) 1259 cl->offtime = lss->offtime; 1260 return 0; 1261} 1262 1263static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) 1264{ 1265 q->nclasses[cl->priority]--; 1266 q->quanta[cl->priority] -= cl->weight; 1267 cbq_normalize_quanta(q, cl->priority); 1268} 1269 1270static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) 1271{ 1272 q->nclasses[cl->priority]++; 1273 q->quanta[cl->priority] += cl->weight; 1274 cbq_normalize_quanta(q, cl->priority); 1275} 1276 1277static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) 1278{ 1279 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1280 1281 if (wrr->allot) 1282 cl->allot = wrr->allot; 1283 if (wrr->weight) 1284 cl->weight = wrr->weight; 1285 if (wrr->priority) { 1286 cl->priority = wrr->priority-1; 1287 cl->cpriority = cl->priority; 1288 if (cl->priority >= cl->priority2) 1289 cl->priority2 = TC_CBQ_MAXPRIO-1; 1290 } 1291 1292 cbq_addprio(q, cl); 1293 return 0; 1294} 1295 1296static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) 1297{ 1298 switch (ovl->strategy) { 1299 case TC_CBQ_OVL_CLASSIC: 1300 cl->overlimit = cbq_ovl_classic; 1301 break; 1302 case TC_CBQ_OVL_DELAY: 1303 cl->overlimit = cbq_ovl_delay; 1304 break; 1305 case TC_CBQ_OVL_LOWPRIO: 1306 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || 1307 ovl->priority2-1 <= cl->priority) 1308 return -EINVAL; 1309 cl->priority2 = ovl->priority2-1; 1310 cl->overlimit = cbq_ovl_lowprio; 1311 break; 1312 case TC_CBQ_OVL_DROP: 1313 cl->overlimit = cbq_ovl_drop; 1314 break; 1315 case TC_CBQ_OVL_RCLASSIC: 1316 cl->overlimit = cbq_ovl_rclassic; 1317 break; 1318 default: 1319 return -EINVAL; 1320 } 1321 cl->penalty = ovl->penalty; 1322 return 0; 1323} 1324 1325#ifdef CONFIG_NET_CLS_ACT 1326static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) 1327{ 1328 cl->police = p->police; 1329 1330 if (cl->q->handle) { 1331 if (p->police == TC_POLICE_RECLASSIFY) 1332 cl->q->reshape_fail = cbq_reshape_fail; 1333 else 1334 cl->q->reshape_fail = NULL; 1335 } 1336 return 0; 1337} 1338#endif 1339 1340static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) 1341{ 1342 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); 1343 return 0; 1344} 1345 1346static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = { 1347 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) }, 1348 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) }, 1349 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) }, 1350 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) }, 1351 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) }, 1352 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE }, 1353 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) }, 1354}; 1355 1356static int cbq_init(struct Qdisc *sch, struct nlattr *opt) 1357{ 1358 struct cbq_sched_data *q = qdisc_priv(sch); 1359 struct nlattr *tb[TCA_CBQ_MAX + 1]; 1360 struct tc_ratespec *r; 1361 int err; 1362 1363 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); 1364 if (err < 0) 1365 return err; 1366 1367 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL) 1368 return -EINVAL; 1369 1370 r = nla_data(tb[TCA_CBQ_RATE]); 1371 1372 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL) 1373 return -EINVAL; 1374 1375 err = qdisc_class_hash_init(&q->clhash); 1376 if (err < 0) 1377 goto put_rtab; 1378 1379 q->link.refcnt = 1; 1380 q->link.sibling = &q->link; 1381 q->link.common.classid = sch->handle; 1382 q->link.qdisc = sch; 1383 if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1384 &pfifo_qdisc_ops, 1385 sch->handle))) 1386 q->link.q = &noop_qdisc; 1387 1388 q->link.priority = TC_CBQ_MAXPRIO-1; 1389 q->link.priority2 = TC_CBQ_MAXPRIO-1; 1390 q->link.cpriority = TC_CBQ_MAXPRIO-1; 1391 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; 1392 q->link.overlimit = cbq_ovl_classic; 1393 q->link.allot = psched_mtu(qdisc_dev(sch)); 1394 q->link.quantum = q->link.allot; 1395 q->link.weight = q->link.R_tab->rate.rate; 1396 1397 q->link.ewma_log = TC_CBQ_DEF_EWMA; 1398 q->link.avpkt = q->link.allot/2; 1399 q->link.minidle = -0x7FFFFFFF; 1400 1401 qdisc_watchdog_init(&q->watchdog, sch); 1402 tasklet_hrtimer_init(&q->delay_timer, cbq_undelay, 1403 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 1404 q->delay_timer.function = cbq_undelay; 1405 q->toplevel = TC_CBQ_MAXLEVEL; 1406 q->now = psched_get_time(); 1407 q->now_rt = q->now; 1408 1409 cbq_link_class(&q->link); 1410 1411 if (tb[TCA_CBQ_LSSOPT]) 1412 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT])); 1413 1414 cbq_addprio(q, &q->link); 1415 return 0; 1416 1417put_rtab: 1418 qdisc_put_rtab(q->link.R_tab); 1419 return err; 1420} 1421 1422static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) 1423{ 1424 unsigned char *b = skb_tail_pointer(skb); 1425 1426 NLA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); 1427 return skb->len; 1428 1429nla_put_failure: 1430 nlmsg_trim(skb, b); 1431 return -1; 1432} 1433 1434static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) 1435{ 1436 unsigned char *b = skb_tail_pointer(skb); 1437 struct tc_cbq_lssopt opt; 1438 1439 opt.flags = 0; 1440 if (cl->borrow == NULL) 1441 opt.flags |= TCF_CBQ_LSS_BOUNDED; 1442 if (cl->share == NULL) 1443 opt.flags |= TCF_CBQ_LSS_ISOLATED; 1444 opt.ewma_log = cl->ewma_log; 1445 opt.level = cl->level; 1446 opt.avpkt = cl->avpkt; 1447 opt.maxidle = cl->maxidle; 1448 opt.minidle = (u32)(-cl->minidle); 1449 opt.offtime = cl->offtime; 1450 opt.change = ~0; 1451 NLA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); 1452 return skb->len; 1453 1454nla_put_failure: 1455 nlmsg_trim(skb, b); 1456 return -1; 1457} 1458 1459static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) 1460{ 1461 unsigned char *b = skb_tail_pointer(skb); 1462 struct tc_cbq_wrropt opt; 1463 1464 opt.flags = 0; 1465 opt.allot = cl->allot; 1466 opt.priority = cl->priority+1; 1467 opt.cpriority = cl->cpriority+1; 1468 opt.weight = cl->weight; 1469 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); 1470 return skb->len; 1471 1472nla_put_failure: 1473 nlmsg_trim(skb, b); 1474 return -1; 1475} 1476 1477static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) 1478{ 1479 unsigned char *b = skb_tail_pointer(skb); 1480 struct tc_cbq_ovl opt; 1481 1482 opt.strategy = cl->ovl_strategy; 1483 opt.priority2 = cl->priority2+1; 1484 opt.pad = 0; 1485 opt.penalty = cl->penalty; 1486 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1487 return skb->len; 1488 1489nla_put_failure: 1490 nlmsg_trim(skb, b); 1491 return -1; 1492} 1493 1494static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) 1495{ 1496 unsigned char *b = skb_tail_pointer(skb); 1497 struct tc_cbq_fopt opt; 1498 1499 if (cl->split || cl->defmap) { 1500 opt.split = cl->split ? cl->split->common.classid : 0; 1501 opt.defmap = cl->defmap; 1502 opt.defchange = ~0; 1503 NLA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); 1504 } 1505 return skb->len; 1506 1507nla_put_failure: 1508 nlmsg_trim(skb, b); 1509 return -1; 1510} 1511 1512#ifdef CONFIG_NET_CLS_ACT 1513static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) 1514{ 1515 unsigned char *b = skb_tail_pointer(skb); 1516 struct tc_cbq_police opt; 1517 1518 if (cl->police) { 1519 opt.police = cl->police; 1520 opt.__res1 = 0; 1521 opt.__res2 = 0; 1522 NLA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); 1523 } 1524 return skb->len; 1525 1526nla_put_failure: 1527 nlmsg_trim(skb, b); 1528 return -1; 1529} 1530#endif 1531 1532static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) 1533{ 1534 if (cbq_dump_lss(skb, cl) < 0 || 1535 cbq_dump_rate(skb, cl) < 0 || 1536 cbq_dump_wrr(skb, cl) < 0 || 1537 cbq_dump_ovl(skb, cl) < 0 || 1538#ifdef CONFIG_NET_CLS_ACT 1539 cbq_dump_police(skb, cl) < 0 || 1540#endif 1541 cbq_dump_fopt(skb, cl) < 0) 1542 return -1; 1543 return 0; 1544} 1545 1546static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) 1547{ 1548 struct cbq_sched_data *q = qdisc_priv(sch); 1549 struct nlattr *nest; 1550 1551 nest = nla_nest_start(skb, TCA_OPTIONS); 1552 if (nest == NULL) 1553 goto nla_put_failure; 1554 if (cbq_dump_attr(skb, &q->link) < 0) 1555 goto nla_put_failure; 1556 nla_nest_end(skb, nest); 1557 return skb->len; 1558 1559nla_put_failure: 1560 nla_nest_cancel(skb, nest); 1561 return -1; 1562} 1563 1564static int 1565cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) 1566{ 1567 struct cbq_sched_data *q = qdisc_priv(sch); 1568 1569 q->link.xstats.avgidle = q->link.avgidle; 1570 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); 1571} 1572 1573static int 1574cbq_dump_class(struct Qdisc *sch, unsigned long arg, 1575 struct sk_buff *skb, struct tcmsg *tcm) 1576{ 1577 struct cbq_class *cl = (struct cbq_class*)arg; 1578 struct nlattr *nest; 1579 1580 if (cl->tparent) 1581 tcm->tcm_parent = cl->tparent->common.classid; 1582 else 1583 tcm->tcm_parent = TC_H_ROOT; 1584 tcm->tcm_handle = cl->common.classid; 1585 tcm->tcm_info = cl->q->handle; 1586 1587 nest = nla_nest_start(skb, TCA_OPTIONS); 1588 if (nest == NULL) 1589 goto nla_put_failure; 1590 if (cbq_dump_attr(skb, cl) < 0) 1591 goto nla_put_failure; 1592 nla_nest_end(skb, nest); 1593 return skb->len; 1594 1595nla_put_failure: 1596 nla_nest_cancel(skb, nest); 1597 return -1; 1598} 1599 1600static int 1601cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, 1602 struct gnet_dump *d) 1603{ 1604 struct cbq_sched_data *q = qdisc_priv(sch); 1605 struct cbq_class *cl = (struct cbq_class*)arg; 1606 1607 cl->qstats.qlen = cl->q->q.qlen; 1608 cl->xstats.avgidle = cl->avgidle; 1609 cl->xstats.undertime = 0; 1610 1611 if (cl->undertime != PSCHED_PASTPERFECT) 1612 cl->xstats.undertime = cl->undertime - q->now; 1613 1614 if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || 1615 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || 1616 gnet_stats_copy_queue(d, &cl->qstats) < 0) 1617 return -1; 1618 1619 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); 1620} 1621 1622static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1623 struct Qdisc **old) 1624{ 1625 struct cbq_class *cl = (struct cbq_class*)arg; 1626 1627 if (cl) { 1628 if (new == NULL) { 1629 new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1630 &pfifo_qdisc_ops, 1631 cl->common.classid); 1632 if (new == NULL) 1633 return -ENOBUFS; 1634 } else { 1635#ifdef CONFIG_NET_CLS_ACT 1636 if (cl->police == TC_POLICE_RECLASSIFY) 1637 new->reshape_fail = cbq_reshape_fail; 1638#endif 1639 } 1640 sch_tree_lock(sch); 1641 *old = cl->q; 1642 cl->q = new; 1643 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen); 1644 qdisc_reset(*old); 1645 sch_tree_unlock(sch); 1646 1647 return 0; 1648 } 1649 return -ENOENT; 1650} 1651 1652static struct Qdisc * 1653cbq_leaf(struct Qdisc *sch, unsigned long arg) 1654{ 1655 struct cbq_class *cl = (struct cbq_class*)arg; 1656 1657 return cl ? cl->q : NULL; 1658} 1659 1660static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg) 1661{ 1662 struct cbq_class *cl = (struct cbq_class *)arg; 1663 1664 if (cl->q->q.qlen == 0) 1665 cbq_deactivate_class(cl); 1666} 1667 1668static unsigned long cbq_get(struct Qdisc *sch, u32 classid) 1669{ 1670 struct cbq_sched_data *q = qdisc_priv(sch); 1671 struct cbq_class *cl = cbq_class_lookup(q, classid); 1672 1673 if (cl) { 1674 cl->refcnt++; 1675 return (unsigned long)cl; 1676 } 1677 return 0; 1678} 1679 1680static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) 1681{ 1682 struct cbq_sched_data *q = qdisc_priv(sch); 1683 1684 WARN_ON(cl->filters); 1685 1686 tcf_destroy_chain(&cl->filter_list); 1687 qdisc_destroy(cl->q); 1688 qdisc_put_rtab(cl->R_tab); 1689 gen_kill_estimator(&cl->bstats, &cl->rate_est); 1690 if (cl != &q->link) 1691 kfree(cl); 1692} 1693 1694static void 1695cbq_destroy(struct Qdisc* sch) 1696{ 1697 struct cbq_sched_data *q = qdisc_priv(sch); 1698 struct hlist_node *n, *next; 1699 struct cbq_class *cl; 1700 unsigned h; 1701 1702#ifdef CONFIG_NET_CLS_ACT 1703 q->rx_class = NULL; 1704#endif 1705 /* 1706 * Filters must be destroyed first because we don't destroy the 1707 * classes from root to leafs which means that filters can still 1708 * be bound to classes which have been destroyed already. --TGR '04 1709 */ 1710 for (h = 0; h < q->clhash.hashsize; h++) { 1711 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) 1712 tcf_destroy_chain(&cl->filter_list); 1713 } 1714 for (h = 0; h < q->clhash.hashsize; h++) { 1715 hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[h], 1716 common.hnode) 1717 cbq_destroy_class(sch, cl); 1718 } 1719 qdisc_class_hash_destroy(&q->clhash); 1720} 1721 1722static void cbq_put(struct Qdisc *sch, unsigned long arg) 1723{ 1724 struct cbq_class *cl = (struct cbq_class*)arg; 1725 1726 if (--cl->refcnt == 0) { 1727#ifdef CONFIG_NET_CLS_ACT 1728 spinlock_t *root_lock = qdisc_root_sleeping_lock(sch); 1729 struct cbq_sched_data *q = qdisc_priv(sch); 1730 1731 spin_lock_bh(root_lock); 1732 if (q->rx_class == cl) 1733 q->rx_class = NULL; 1734 spin_unlock_bh(root_lock); 1735#endif 1736 1737 cbq_destroy_class(sch, cl); 1738 } 1739} 1740 1741static int 1742cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca, 1743 unsigned long *arg) 1744{ 1745 int err; 1746 struct cbq_sched_data *q = qdisc_priv(sch); 1747 struct cbq_class *cl = (struct cbq_class*)*arg; 1748 struct nlattr *opt = tca[TCA_OPTIONS]; 1749 struct nlattr *tb[TCA_CBQ_MAX + 1]; 1750 struct cbq_class *parent; 1751 struct qdisc_rate_table *rtab = NULL; 1752 1753 if (opt == NULL) 1754 return -EINVAL; 1755 1756 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); 1757 if (err < 0) 1758 return err; 1759 1760 if (cl) { 1761 /* Check parent */ 1762 if (parentid) { 1763 if (cl->tparent && 1764 cl->tparent->common.classid != parentid) 1765 return -EINVAL; 1766 if (!cl->tparent && parentid != TC_H_ROOT) 1767 return -EINVAL; 1768 } 1769 1770 if (tb[TCA_CBQ_RATE]) { 1771 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), 1772 tb[TCA_CBQ_RTAB]); 1773 if (rtab == NULL) 1774 return -EINVAL; 1775 } 1776 1777 if (tca[TCA_RATE]) { 1778 err = gen_replace_estimator(&cl->bstats, &cl->rate_est, 1779 qdisc_root_sleeping_lock(sch), 1780 tca[TCA_RATE]); 1781 if (err) { 1782 if (rtab) 1783 qdisc_put_rtab(rtab); 1784 return err; 1785 } 1786 } 1787 1788 /* Change class parameters */ 1789 sch_tree_lock(sch); 1790 1791 if (cl->next_alive != NULL) 1792 cbq_deactivate_class(cl); 1793 1794 if (rtab) { 1795 qdisc_put_rtab(cl->R_tab); 1796 cl->R_tab = rtab; 1797 } 1798 1799 if (tb[TCA_CBQ_LSSOPT]) 1800 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); 1801 1802 if (tb[TCA_CBQ_WRROPT]) { 1803 cbq_rmprio(q, cl); 1804 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); 1805 } 1806 1807 if (tb[TCA_CBQ_OVL_STRATEGY]) 1808 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); 1809 1810#ifdef CONFIG_NET_CLS_ACT 1811 if (tb[TCA_CBQ_POLICE]) 1812 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); 1813#endif 1814 1815 if (tb[TCA_CBQ_FOPT]) 1816 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); 1817 1818 if (cl->q->q.qlen) 1819 cbq_activate_class(cl); 1820 1821 sch_tree_unlock(sch); 1822 1823 return 0; 1824 } 1825 1826 if (parentid == TC_H_ROOT) 1827 return -EINVAL; 1828 1829 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL || 1830 tb[TCA_CBQ_LSSOPT] == NULL) 1831 return -EINVAL; 1832 1833 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]); 1834 if (rtab == NULL) 1835 return -EINVAL; 1836 1837 if (classid) { 1838 err = -EINVAL; 1839 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) 1840 goto failure; 1841 } else { 1842 int i; 1843 classid = TC_H_MAKE(sch->handle,0x8000); 1844 1845 for (i=0; i<0x8000; i++) { 1846 if (++q->hgenerator >= 0x8000) 1847 q->hgenerator = 1; 1848 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) 1849 break; 1850 } 1851 err = -ENOSR; 1852 if (i >= 0x8000) 1853 goto failure; 1854 classid = classid|q->hgenerator; 1855 } 1856 1857 parent = &q->link; 1858 if (parentid) { 1859 parent = cbq_class_lookup(q, parentid); 1860 err = -EINVAL; 1861 if (parent == NULL) 1862 goto failure; 1863 } 1864 1865 err = -ENOBUFS; 1866 cl = kzalloc(sizeof(*cl), GFP_KERNEL); 1867 if (cl == NULL) 1868 goto failure; 1869 1870 if (tca[TCA_RATE]) { 1871 err = gen_new_estimator(&cl->bstats, &cl->rate_est, 1872 qdisc_root_sleeping_lock(sch), 1873 tca[TCA_RATE]); 1874 if (err) { 1875 kfree(cl); 1876 goto failure; 1877 } 1878 } 1879 1880 cl->R_tab = rtab; 1881 rtab = NULL; 1882 cl->refcnt = 1; 1883 if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, 1884 &pfifo_qdisc_ops, classid))) 1885 cl->q = &noop_qdisc; 1886 cl->common.classid = classid; 1887 cl->tparent = parent; 1888 cl->qdisc = sch; 1889 cl->allot = parent->allot; 1890 cl->quantum = cl->allot; 1891 cl->weight = cl->R_tab->rate.rate; 1892 1893 sch_tree_lock(sch); 1894 cbq_link_class(cl); 1895 cl->borrow = cl->tparent; 1896 if (cl->tparent != &q->link) 1897 cl->share = cl->tparent; 1898 cbq_adjust_levels(parent); 1899 cl->minidle = -0x7FFFFFFF; 1900 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); 1901 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); 1902 if (cl->ewma_log==0) 1903 cl->ewma_log = q->link.ewma_log; 1904 if (cl->maxidle==0) 1905 cl->maxidle = q->link.maxidle; 1906 if (cl->avpkt==0) 1907 cl->avpkt = q->link.avpkt; 1908 cl->overlimit = cbq_ovl_classic; 1909 if (tb[TCA_CBQ_OVL_STRATEGY]) 1910 cbq_set_overlimit(cl, nla_data(tb[TCA_CBQ_OVL_STRATEGY])); 1911#ifdef CONFIG_NET_CLS_ACT 1912 if (tb[TCA_CBQ_POLICE]) 1913 cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE])); 1914#endif 1915 if (tb[TCA_CBQ_FOPT]) 1916 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT])); 1917 sch_tree_unlock(sch); 1918 1919 qdisc_class_hash_grow(sch, &q->clhash); 1920 1921 *arg = (unsigned long)cl; 1922 return 0; 1923 1924failure: 1925 qdisc_put_rtab(rtab); 1926 return err; 1927} 1928 1929static int cbq_delete(struct Qdisc *sch, unsigned long arg) 1930{ 1931 struct cbq_sched_data *q = qdisc_priv(sch); 1932 struct cbq_class *cl = (struct cbq_class*)arg; 1933 unsigned int qlen; 1934 1935 if (cl->filters || cl->children || cl == &q->link) 1936 return -EBUSY; 1937 1938 sch_tree_lock(sch); 1939 1940 qlen = cl->q->q.qlen; 1941 qdisc_reset(cl->q); 1942 qdisc_tree_decrease_qlen(cl->q, qlen); 1943 1944 if (cl->next_alive) 1945 cbq_deactivate_class(cl); 1946 1947 if (q->tx_borrowed == cl) 1948 q->tx_borrowed = q->tx_class; 1949 if (q->tx_class == cl) { 1950 q->tx_class = NULL; 1951 q->tx_borrowed = NULL; 1952 } 1953#ifdef CONFIG_NET_CLS_ACT 1954 if (q->rx_class == cl) 1955 q->rx_class = NULL; 1956#endif 1957 1958 cbq_unlink_class(cl); 1959 cbq_adjust_levels(cl->tparent); 1960 cl->defmap = 0; 1961 cbq_sync_defmap(cl); 1962 1963 cbq_rmprio(q, cl); 1964 sch_tree_unlock(sch); 1965 1966 BUG_ON(--cl->refcnt == 0); 1967 /* 1968 * This shouldn't happen: we "hold" one cops->get() when called 1969 * from tc_ctl_tclass; the destroy method is done from cops->put(). 1970 */ 1971 1972 return 0; 1973} 1974 1975static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) 1976{ 1977 struct cbq_sched_data *q = qdisc_priv(sch); 1978 struct cbq_class *cl = (struct cbq_class *)arg; 1979 1980 if (cl == NULL) 1981 cl = &q->link; 1982 1983 return &cl->filter_list; 1984} 1985 1986static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, 1987 u32 classid) 1988{ 1989 struct cbq_sched_data *q = qdisc_priv(sch); 1990 struct cbq_class *p = (struct cbq_class*)parent; 1991 struct cbq_class *cl = cbq_class_lookup(q, classid); 1992 1993 if (cl) { 1994 if (p && p->level <= cl->level) 1995 return 0; 1996 cl->filters++; 1997 return (unsigned long)cl; 1998 } 1999 return 0; 2000} 2001 2002static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) 2003{ 2004 struct cbq_class *cl = (struct cbq_class*)arg; 2005 2006 cl->filters--; 2007} 2008 2009static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) 2010{ 2011 struct cbq_sched_data *q = qdisc_priv(sch); 2012 struct cbq_class *cl; 2013 struct hlist_node *n; 2014 unsigned h; 2015 2016 if (arg->stop) 2017 return; 2018 2019 for (h = 0; h < q->clhash.hashsize; h++) { 2020 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 2021 if (arg->count < arg->skip) { 2022 arg->count++; 2023 continue; 2024 } 2025 if (arg->fn(sch, (unsigned long)cl, arg) < 0) { 2026 arg->stop = 1; 2027 return; 2028 } 2029 arg->count++; 2030 } 2031 } 2032} 2033 2034static const struct Qdisc_class_ops cbq_class_ops = { 2035 .graft = cbq_graft, 2036 .leaf = cbq_leaf, 2037 .qlen_notify = cbq_qlen_notify, 2038 .get = cbq_get, 2039 .put = cbq_put, 2040 .change = cbq_change_class, 2041 .delete = cbq_delete, 2042 .walk = cbq_walk, 2043 .tcf_chain = cbq_find_tcf, 2044 .bind_tcf = cbq_bind_filter, 2045 .unbind_tcf = cbq_unbind_filter, 2046 .dump = cbq_dump_class, 2047 .dump_stats = cbq_dump_class_stats, 2048}; 2049 2050static struct Qdisc_ops cbq_qdisc_ops __read_mostly = { 2051 .next = NULL, 2052 .cl_ops = &cbq_class_ops, 2053 .id = "cbq", 2054 .priv_size = sizeof(struct cbq_sched_data), 2055 .enqueue = cbq_enqueue, 2056 .dequeue = cbq_dequeue, 2057 .peek = qdisc_peek_dequeued, 2058 .drop = cbq_drop, 2059 .init = cbq_init, 2060 .reset = cbq_reset, 2061 .destroy = cbq_destroy, 2062 .change = NULL, 2063 .dump = cbq_dump, 2064 .dump_stats = cbq_dump_stats, 2065 .owner = THIS_MODULE, 2066}; 2067 2068static int __init cbq_module_init(void) 2069{ 2070 return register_qdisc(&cbq_qdisc_ops); 2071} 2072static void __exit cbq_module_exit(void) 2073{ 2074 unregister_qdisc(&cbq_qdisc_ops); 2075} 2076module_init(cbq_module_init) 2077module_exit(cbq_module_exit) 2078MODULE_LICENSE("GPL");