Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net_sched: sfq: add optional RED on top of SFQ

Adds an optional Random Early Detection on each SFQ flow queue.

Traditional SFQ limits the count of packets, while RED also permits
controlling the number of bytes per flow, and adds ECN capability as well.

1) We don't handle the idle time management in this RED implementation,
since each 'new flow' begins with a null qavg. We really want to address
backlogged flows.

2) If headdrop is selected, we try to ECN-mark the first packet instead of
the currently enqueued packet. This gives faster feedback for TCP flows
compared to traditional RED [marking the last packet in the queue].

Example of use:

tc qdisc add dev $DEV parent 1:1 handle 10: est 1sec 4sec sfq \
limit 3000 headdrop flows 512 divisor 16384 \
redflowlimit 100000 min 8000 max 60000 probability 0.20 ecn

qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
ewma 6 min 8000b max 60000b probability 0.2 ecn
prob_mark 0 prob_mark_head 4876 prob_drop 6131
forced_mark 0 forced_mark_head 0 forced_drop 0
Sent 1175211782 bytes 777537 pkt (dropped 6131, overlimits 11007
requeues 0)
rate 99483Kbit 8219pps backlog 689392b 456p requeues 0

In this test, with 64 netperf TCP_STREAM sessions, 50% using ECN-enabled
flows, we can see that the number of CE-marked packets is smaller than the
number of drops (for non-ECN flows).

If the same test is run without RED, we can see that the backlog is much bigger.

qdisc sfq 10: parent 1:1 limit 3000p quantum 1514b depth 127 headdrop
flows 512/16384 divisor 16384
Sent 1148683617 bytes 795006 pkt (dropped 0, overlimits 0 requeues 0)
rate 98429Kbit 8521pps backlog 1221290b 841p requeues 0

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Dave Taht <dave.taht@gmail.com>
Tested-by: Dave Taht <dave.taht@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Eric Dumazet and committed by
David S. Miller
ddecf0f4 72092cc4

+152 -17
+20
include/linux/pkt_sched.h
··· 162 162 unsigned flows; /* Maximal number of flows */ 163 163 }; 164 164 165 + struct tc_sfqred_stats { 166 + __u32 prob_drop; /* Early drops, below max threshold */ 167 + __u32 forced_drop; /* Early drops, after max threshold */ 168 + __u32 prob_mark; /* Marked packets, below max threshold */ 169 + __u32 forced_mark; /* Marked packets, after max threshold */ 170 + __u32 prob_mark_head; /* Marked packets, below max threshold */ 171 + __u32 forced_mark_head;/* Marked packets, after max threshold */ 172 + }; 173 + 165 174 struct tc_sfq_qopt_v1 { 166 175 struct tc_sfq_qopt v0; 167 176 unsigned int depth; /* max number of packets per flow */ 168 177 unsigned int headdrop; 178 + /* SFQRED parameters */ 179 + __u32 limit; /* HARD maximal flow queue length (bytes) */ 180 + __u32 qth_min; /* Min average length threshold (bytes) */ 181 + __u32 qth_max; /* Max average length threshold (bytes) */ 182 + unsigned char Wlog; /* log(W) */ 183 + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ 184 + unsigned char Scell_log; /* cell size for idle damping */ 185 + unsigned char flags; 186 + __u32 max_P; /* probability, high resolution */ 187 + /* SFQRED stats */ 188 + struct tc_sfqred_stats stats; 169 189 }; 170 190 171 191
+2 -1
include/net/red.h
··· 199 199 p->Scell_log = Scell_log; 200 200 p->Scell_max = (255 << Scell_log); 201 201 202 - memcpy(p->Stab, stab, sizeof(p->Stab)); 202 + if (stab) 203 + memcpy(p->Stab, stab, sizeof(p->Stab)); 203 204 } 204 205 205 206 static inline int red_is_idling(const struct red_vars *v)
+130 -16
net/sched/sch_sfq.c
··· 24 24 #include <net/netlink.h> 25 25 #include <net/pkt_sched.h> 26 26 #include <net/flow_keys.h> 27 + #include <net/red.h> 27 28 28 29 29 30 /* Stochastic Fairness Queuing algorithm. ··· 109 108 struct sfq_head dep; /* anchor in dep[] chains */ 110 109 unsigned short hash; /* hash value (index in ht[]) */ 111 110 short allot; /* credit for this slot */ 111 + 112 + unsigned int backlog; 113 + struct red_vars vars; 112 114 }; 113 115 114 116 struct sfq_sched_data { 115 117 /* frequently used fields */ 116 118 int limit; /* limit of total number of packets in this qdisc */ 117 119 unsigned int divisor; /* number of slots in hash table */ 118 - unsigned int maxflows; /* number of flows in flows array */ 119 - int headdrop; 120 - int maxdepth; /* limit of packets per flow */ 120 + u8 headdrop; 121 + u8 maxdepth; /* limit of packets per flow */ 121 122 122 123 u32 perturbation; 123 - struct tcf_proto *filter_list; 124 - sfq_index cur_depth; /* depth of longest slot */ 124 + u8 cur_depth; /* depth of longest slot */ 125 + u8 flags; 125 126 unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ 126 - struct sfq_slot *tail; /* current slot in round */ 127 + struct tcf_proto *filter_list; 127 128 sfq_index *ht; /* Hash table ('divisor' slots) */ 128 129 struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ 130 + 131 + struct red_parms *red_parms; 132 + struct tc_sfqred_stats stats; 133 + struct sfq_slot *tail; /* current slot in round */ 129 134 130 135 struct sfq_head dep[SFQ_MAX_DEPTH + 1]; 131 136 /* Linked lists of slots, indexed by depth ··· 140 133 * dep[X] : list of flows with X packets 141 134 */ 142 135 136 + unsigned int maxflows; /* number of flows in flows array */ 143 137 int perturb_period; 144 138 unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ 145 139 struct timer_list perturb_timer; ··· 329 321 drop: 330 322 skb = q->headdrop ? 
slot_dequeue_head(slot) : slot_dequeue_tail(slot); 331 323 len = qdisc_pkt_len(skb); 324 + slot->backlog -= len; 332 325 sfq_dec(q, x); 333 326 kfree_skb(skb); 334 327 sch->q.qlen--; ··· 350 341 return 0; 351 342 } 352 343 344 + /* Is ECN parameter configured */ 345 + static int sfq_prob_mark(const struct sfq_sched_data *q) 346 + { 347 + return q->flags & TC_RED_ECN; 348 + } 349 + 350 + /* Should packets over max threshold just be marked */ 351 + static int sfq_hard_mark(const struct sfq_sched_data *q) 352 + { 353 + return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; 354 + } 355 + 356 + static int sfq_headdrop(const struct sfq_sched_data *q) 357 + { 358 + return q->headdrop; 359 + } 360 + 353 361 static int 354 362 sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 355 363 { ··· 375 349 sfq_index x, qlen; 376 350 struct sfq_slot *slot; 377 351 int uninitialized_var(ret); 352 + struct sk_buff *head; 353 + int delta; 378 354 379 355 hash = sfq_classify(skb, sch, &ret); 380 356 if (hash == 0) { ··· 396 368 q->ht[hash] = x; 397 369 slot = &q->slots[x]; 398 370 slot->hash = hash; 371 + slot->backlog = 0; /* should already be 0 anyway... 
*/ 372 + red_set_vars(&slot->vars); 373 + goto enqueue; 374 + } 375 + if (q->red_parms) { 376 + slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, 377 + &slot->vars, 378 + slot->backlog); 379 + switch (red_action(q->red_parms, 380 + &slot->vars, 381 + slot->vars.qavg)) { 382 + case RED_DONT_MARK: 383 + break; 384 + 385 + case RED_PROB_MARK: 386 + sch->qstats.overlimits++; 387 + if (sfq_prob_mark(q)) { 388 + /* We know we have at least one packet in queue */ 389 + if (sfq_headdrop(q) && 390 + INET_ECN_set_ce(slot->skblist_next)) { 391 + q->stats.prob_mark_head++; 392 + break; 393 + } 394 + if (INET_ECN_set_ce(skb)) { 395 + q->stats.prob_mark++; 396 + break; 397 + } 398 + } 399 + q->stats.prob_drop++; 400 + goto congestion_drop; 401 + 402 + case RED_HARD_MARK: 403 + sch->qstats.overlimits++; 404 + if (sfq_hard_mark(q)) { 405 + /* We know we have at least one packet in queue */ 406 + if (sfq_headdrop(q) && 407 + INET_ECN_set_ce(slot->skblist_next)) { 408 + q->stats.forced_mark_head++; 409 + break; 410 + } 411 + if (INET_ECN_set_ce(skb)) { 412 + q->stats.forced_mark++; 413 + break; 414 + } 415 + } 416 + q->stats.forced_drop++; 417 + goto congestion_drop; 418 + } 399 419 } 400 420 401 421 if (slot->qlen >= q->maxdepth) { 402 - struct sk_buff *head; 403 - 404 - if (!q->headdrop) 422 + congestion_drop: 423 + if (!sfq_headdrop(q)) 405 424 return qdisc_drop(skb, sch); 406 425 426 + /* We know we have at least one packet in queue */ 407 427 head = slot_dequeue_head(slot); 408 - sch->qstats.backlog -= qdisc_pkt_len(head); 428 + delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); 429 + sch->qstats.backlog -= delta; 430 + slot->backlog -= delta; 409 431 qdisc_drop(head, sch); 410 432 411 - sch->qstats.backlog += qdisc_pkt_len(skb); 412 433 slot_queue_add(slot, skb); 413 434 return NET_XMIT_CN; 414 435 } 415 436 437 + enqueue: 416 438 sch->qstats.backlog += qdisc_pkt_len(skb); 439 + slot->backlog += qdisc_pkt_len(skb); 417 440 slot_queue_add(slot, skb); 418 441 sfq_inc(q, 
x); 419 442 if (slot->qlen == 1) { /* The flow is new */ ··· 475 396 slot->next = q->tail->next; 476 397 q->tail->next = x; 477 398 } 399 + /* We could use a bigger initial quantum for new flows */ 478 400 slot->allot = q->scaled_quantum; 479 401 } 480 402 if (++sch->q.qlen <= q->limit) ··· 519 439 qdisc_bstats_update(sch, skb); 520 440 sch->q.qlen--; 521 441 sch->qstats.backlog -= qdisc_pkt_len(skb); 522 - 442 + slot->backlog -= qdisc_pkt_len(skb); 523 443 /* Is the slot empty? */ 524 444 if (slot->qlen == 0) { 525 445 q->ht[slot->hash] = SFQ_EMPTY_SLOT; ··· 570 490 sfq_dec(q, i); 571 491 __skb_queue_tail(&list, skb); 572 492 } 493 + slot->backlog = 0; 494 + red_set_vars(&slot->vars); 573 495 q->ht[slot->hash] = SFQ_EMPTY_SLOT; 574 496 } 575 497 q->tail = NULL; ··· 596 514 if (slot->qlen >= q->maxdepth) 597 515 goto drop; 598 516 slot_queue_add(slot, skb); 517 + if (q->red_parms) 518 + slot->vars.qavg = red_calc_qavg(q->red_parms, 519 + &slot->vars, 520 + slot->backlog); 521 + slot->backlog += qdisc_pkt_len(skb); 599 522 sfq_inc(q, x); 600 523 if (slot->qlen == 1) { /* The flow is new */ 601 524 if (q->tail == NULL) { /* It is the first flow */ ··· 639 552 struct tc_sfq_qopt *ctl = nla_data(opt); 640 553 struct tc_sfq_qopt_v1 *ctl_v1 = NULL; 641 554 unsigned int qlen; 555 + struct red_parms *p = NULL; 642 556 643 557 if (opt->nla_len < nla_attr_size(sizeof(*ctl))) 644 558 return -EINVAL; ··· 648 560 if (ctl->divisor && 649 561 (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) 650 562 return -EINVAL; 651 - 563 + if (ctl_v1 && ctl_v1->qth_min) { 564 + p = kmalloc(sizeof(*p), GFP_KERNEL); 565 + if (!p) 566 + return -ENOMEM; 567 + } 652 568 sch_tree_lock(sch); 653 569 if (ctl->quantum) { 654 570 q->quantum = ctl->quantum; ··· 668 576 if (ctl_v1) { 669 577 if (ctl_v1->depth) 670 578 q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); 579 + if (p) { 580 + swap(q->red_parms, p); 581 + red_set_parms(q->red_parms, 582 + ctl_v1->qth_min, ctl_v1->qth_max, 583 + 
ctl_v1->Wlog, 584 + ctl_v1->Plog, ctl_v1->Scell_log, 585 + NULL, 586 + ctl_v1->max_P); 587 + } 588 + q->flags = ctl_v1->flags; 671 589 q->headdrop = ctl_v1->headdrop; 672 590 } 673 591 if (ctl->limit) { ··· 696 594 q->perturbation = net_random(); 697 595 } 698 596 sch_tree_unlock(sch); 597 + kfree(p); 699 598 return 0; 700 599 } 701 600 ··· 728 625 del_timer_sync(&q->perturb_timer); 729 626 sfq_free(q->ht); 730 627 sfq_free(q->slots); 628 + kfree(q->red_parms); 731 629 } 732 630 733 631 static int sfq_init(struct Qdisc *sch, struct nlattr *opt) ··· 787 683 struct sfq_sched_data *q = qdisc_priv(sch); 788 684 unsigned char *b = skb_tail_pointer(skb); 789 685 struct tc_sfq_qopt_v1 opt; 686 + struct red_parms *p = q->red_parms; 790 687 791 688 memset(&opt, 0, sizeof(opt)); 792 689 opt.v0.quantum = q->quantum; ··· 797 692 opt.v0.flows = q->maxflows; 798 693 opt.depth = q->maxdepth; 799 694 opt.headdrop = q->headdrop; 695 + 696 + if (p) { 697 + opt.qth_min = p->qth_min >> p->Wlog; 698 + opt.qth_max = p->qth_max >> p->Wlog; 699 + opt.Wlog = p->Wlog; 700 + opt.Plog = p->Plog; 701 + opt.Scell_log = p->Scell_log; 702 + opt.max_P = p->max_P; 703 + } 704 + memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); 705 + opt.flags = q->flags; 800 706 801 707 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 802 708 ··· 863 747 sfq_index idx = q->ht[cl - 1]; 864 748 struct gnet_stats_queue qs = { 0 }; 865 749 struct tc_sfq_xstats xstats = { 0 }; 866 - struct sk_buff *skb; 867 750 868 751 if (idx != SFQ_EMPTY_SLOT) { 869 752 const struct sfq_slot *slot = &q->slots[idx]; 870 753 871 754 xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; 872 755 qs.qlen = slot->qlen; 873 - slot_queue_walk(slot, skb) 874 - qs.backlog += qdisc_pkt_len(skb); 756 + qs.backlog = slot->backlog; 875 757 } 876 758 if (gnet_stats_copy_queue(d, &qs) < 0) 877 759 return -1;