Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/sched: taprio: give higher priority to higher TCs in software dequeue mode

The current taprio software implementation is haunted by the shadow of the
igb/igc hardware model. It iterates over child qdiscs in increasing
order of TXQ index, therefore giving higher xmit priority to TXQ 0 and
lower to TXQ N. According to discussions with Vinicius, that is the
default (perhaps even unchangeable) prioritization scheme used for the
NICs that taprio was first written for (igb, igc), and we have a case of
two bugs canceling out, resulting in a functional setup on igb/igc, but
a less sane one on other NICs.

To the best of my understanding, taprio should prioritize based on the
traffic class, so it should really dequeue starting with the highest
traffic class and going down from there. We get to the TXQ using the
tc_to_txq[] netdev property.

TXQs within the same TC have the same (strict) priority, so we should
pick from them as fairly as we can. We can achieve that by implementing
something very similar to q->curband from multiq_dequeue().

Since igb/igc really do have TXQ 0 of higher hardware priority than
TXQ 1 etc, we need to preserve the behavior for them as well. We really
have no choice, because in txtime-assist mode, taprio is essentially a
software scheduler towards offloaded child tc-etf qdiscs, so the TXQ
selection really does matter (not all igb TXQs support ETF/SO_TXTIME,
says Kurt Kanzenbach).

To preserve the behavior, we need a capability bit so that taprio can
determine if it's running on igb/igc, or on something else. Because igb
doesn't offload taprio at all, we can't piggyback on the
qdisc_offload_query_caps() call from taprio_enable_offload(), but
instead we need a separate call which is also made for software
scheduling.

Introduce two static keys to minimize the performance penalty on systems
which only have igb/igc NICs, and on systems which only have other NICs.
For mixed systems, taprio will have to dynamically check whether to
dequeue using one prioritization algorithm or using the other.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Vladimir Oltean and committed by David S. Miller.
2f530df7 4c229427

+143 -11
+18
drivers/net/ethernet/intel/igb/igb_main.c
··· 2810 2810 return 0; 2811 2811 } 2812 2812 2813 + static int igb_tc_query_caps(struct igb_adapter *adapter, 2814 + struct tc_query_caps_base *base) 2815 + { 2816 + switch (base->type) { 2817 + case TC_SETUP_QDISC_TAPRIO: { 2818 + struct tc_taprio_caps *caps = base->caps; 2819 + 2820 + caps->broken_mqprio = true; 2821 + 2822 + return 0; 2823 + } 2824 + default: 2825 + return -EOPNOTSUPP; 2826 + } 2827 + } 2828 + 2813 2829 static LIST_HEAD(igb_block_cb_list); 2814 2830 2815 2831 static int igb_setup_tc(struct net_device *dev, enum tc_setup_type type, ··· 2834 2818 struct igb_adapter *adapter = netdev_priv(dev); 2835 2819 2836 2820 switch (type) { 2821 + case TC_QUERY_CAPS: 2822 + return igb_tc_query_caps(adapter, type_data); 2837 2823 case TC_SETUP_QDISC_CBS: 2838 2824 return igb_offload_cbs(adapter, type_data); 2839 2825 case TC_SETUP_BLOCK:
+3 -3
drivers/net/ethernet/intel/igc/igc_main.c
··· 6214 6214 case TC_SETUP_QDISC_TAPRIO: { 6215 6215 struct tc_taprio_caps *caps = base->caps; 6216 6216 6217 - if (hw->mac.type != igc_i225) 6218 - return -EOPNOTSUPP; 6217 + caps->broken_mqprio = true; 6219 6218 6220 - caps->gate_mask_per_txq = true; 6219 + if (hw->mac.type == igc_i225) 6220 + caps->gate_mask_per_txq = true; 6221 6221 6222 6222 return 0; 6223 6223 }
+5
include/net/pkt_sched.h
··· 177 177 struct tc_taprio_caps { 178 178 bool supports_queue_max_sdu:1; 179 179 bool gate_mask_per_txq:1; 180 + /* Device expects lower TXQ numbers to have higher priority over higher 181 + * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, 182 + * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. 183 + */ 184 + bool broken_mqprio:1; 180 185 }; 181 186 182 187 struct tc_taprio_sched_entry {
+117 -8
net/sched/sch_taprio.c
··· 29 29 #include "sch_mqprio_lib.h" 30 30 31 31 static LIST_HEAD(taprio_list); 32 + static struct static_key_false taprio_have_broken_mqprio; 33 + static struct static_key_false taprio_have_working_mqprio; 32 34 33 35 #define TAPRIO_ALL_GATES_OPEN -1 34 36 ··· 71 69 enum tk_offsets tk_offset; 72 70 int clockid; 73 71 bool offloaded; 72 + bool detected_mqprio; 73 + bool broken_mqprio; 74 74 atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ 75 75 * speeds it's sub-nanoseconds per byte 76 76 */ ··· 84 80 struct sched_gate_list __rcu *admin_sched; 85 81 struct hrtimer advance_timer; 86 82 struct list_head taprio_list; 83 + int cur_txq[TC_MAX_QUEUE]; 87 84 u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ 88 85 u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ 89 86 u32 txtime_delay; ··· 573 568 return skb; 574 569 } 575 570 571 + static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) 572 + { 573 + int offset = dev->tc_to_txq[tc].offset; 574 + int count = dev->tc_to_txq[tc].count; 575 + 576 + (*txq)++; 577 + if (*txq == offset + count) 578 + *txq = offset; 579 + } 580 + 581 + /* Prioritize higher traffic classes, and select among TXQs belonging to the 582 + * same TC using round robin 583 + */ 584 + static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, 585 + struct sched_entry *entry, 586 + u32 gate_mask) 587 + { 588 + struct taprio_sched *q = qdisc_priv(sch); 589 + struct net_device *dev = qdisc_dev(sch); 590 + int num_tc = netdev_get_num_tc(dev); 591 + struct sk_buff *skb; 592 + int tc; 593 + 594 + for (tc = num_tc - 1; tc >= 0; tc--) { 595 + int first_txq = q->cur_txq[tc]; 596 + 597 + if (!(gate_mask & BIT(tc))) 598 + continue; 599 + 600 + do { 601 + skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], 602 + entry, gate_mask); 603 + 604 + taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); 605 + 606 + if (skb) 607 + return skb; 608 + } while (q->cur_txq[tc] != first_txq); 609 + } 610 + 611 + return NULL; 612 + 
} 613 + 614 + /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic 615 + * class other than to determine whether the gate is open or not 616 + */ 617 + static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, 618 + struct sched_entry *entry, 619 + u32 gate_mask) 620 + { 621 + struct net_device *dev = qdisc_dev(sch); 622 + struct sk_buff *skb; 623 + int i; 624 + 625 + for (i = 0; i < dev->num_tx_queues; i++) { 626 + skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); 627 + if (skb) 628 + return skb; 629 + } 630 + 631 + return NULL; 632 + } 633 + 576 634 /* Will not be called in the full offload case, since the TX queues are 577 635 * attached to the Qdisc created using qdisc_create_dflt() 578 636 */ 579 637 static struct sk_buff *taprio_dequeue(struct Qdisc *sch) 580 638 { 581 639 struct taprio_sched *q = qdisc_priv(sch); 582 - struct net_device *dev = qdisc_dev(sch); 583 640 struct sk_buff *skb = NULL; 584 641 struct sched_entry *entry; 585 642 u32 gate_mask; 586 - int i; 587 643 588 644 rcu_read_lock(); 589 645 entry = rcu_dereference(q->current_entry); ··· 654 588 * "AdminGateStates" 655 589 */ 656 590 gate_mask = entry ? 
entry->gate_mask : TAPRIO_ALL_GATES_OPEN; 657 - 658 591 if (!gate_mask) 659 592 goto done; 660 593 661 - for (i = 0; i < dev->num_tx_queues; i++) { 662 - skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); 663 - if (skb) 664 - goto done; 594 + if (static_branch_unlikely(&taprio_have_broken_mqprio) && 595 + !static_branch_likely(&taprio_have_working_mqprio)) { 596 + /* Single NIC kind which is broken */ 597 + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); 598 + } else if (static_branch_likely(&taprio_have_working_mqprio) && 599 + !static_branch_unlikely(&taprio_have_broken_mqprio)) { 600 + /* Single NIC kind which prioritizes properly */ 601 + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); 602 + } else { 603 + /* Mixed NIC kinds present in system, need dynamic testing */ 604 + if (q->broken_mqprio) 605 + skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); 606 + else 607 + skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); 665 608 } 666 609 667 610 done: ··· 1232 1157 offload->num_entries = i; 1233 1158 } 1234 1159 1160 + static void taprio_detect_broken_mqprio(struct taprio_sched *q) 1161 + { 1162 + struct net_device *dev = qdisc_dev(q->root); 1163 + struct tc_taprio_caps caps; 1164 + 1165 + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, 1166 + &caps, sizeof(caps)); 1167 + 1168 + q->broken_mqprio = caps.broken_mqprio; 1169 + if (q->broken_mqprio) 1170 + static_branch_inc(&taprio_have_broken_mqprio); 1171 + else 1172 + static_branch_inc(&taprio_have_working_mqprio); 1173 + 1174 + q->detected_mqprio = true; 1175 + } 1176 + 1177 + static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) 1178 + { 1179 + if (!q->detected_mqprio) 1180 + return; 1181 + 1182 + if (q->broken_mqprio) 1183 + static_branch_dec(&taprio_have_broken_mqprio); 1184 + else 1185 + static_branch_dec(&taprio_have_working_mqprio); 1186 + } 1187 + 1235 1188 static int taprio_enable_offload(struct net_device *dev, 1236 1189 struct taprio_sched *q, 1237 
1190 struct sched_gate_list *sched, ··· 1641 1538 err = netdev_set_num_tc(dev, mqprio->num_tc); 1642 1539 if (err) 1643 1540 goto free_sched; 1644 - for (i = 0; i < mqprio->num_tc; i++) 1541 + for (i = 0; i < mqprio->num_tc; i++) { 1645 1542 netdev_set_tc_queue(dev, i, 1646 1543 mqprio->count[i], 1647 1544 mqprio->offset[i]); 1545 + q->cur_txq[i] = mqprio->offset[i]; 1546 + } 1648 1547 1649 1548 /* Always use supplied priority mappings */ 1650 1549 for (i = 0; i <= TC_BITMASK; i++) ··· 1781 1676 1782 1677 if (admin) 1783 1678 call_rcu(&admin->rcu, taprio_free_sched_cb); 1679 + 1680 + taprio_cleanup_broken_mqprio(q); 1784 1681 } 1785 1682 1786 1683 static int taprio_init(struct Qdisc *sch, struct nlattr *opt, ··· 1846 1739 1847 1740 q->qdiscs[i] = qdisc; 1848 1741 } 1742 + 1743 + taprio_detect_broken_mqprio(q); 1849 1744 1850 1745 return taprio_change(sch, opt, extack); 1851 1746 }