Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net/sched: mqprio: allow per-TC user input of FP adminStatus

IEEE 802.1Q-2018 clause 6.7.2 Frame preemption specifies that each
packet priority can be assigned to a "frame preemption status" value of
either "express" or "preemptible". Express priorities are transmitted by
the local device through the eMAC, and preemptible priorities through
the pMAC (the concepts of eMAC and pMAC come from the 802.3 MAC Merge
layer).

The FP adminStatus is defined per packet priority, but 802.1Q clause
12.30.1.1.1 framePreemptionAdminStatus also says that:

| Priorities that all map to the same traffic class should be
| constrained to use the same value of preemption status.

It is impossible to ignore the cognitive dissonance in the standard
here, because it practically means that the FP adminStatus only takes
distinct values per traffic class, even though it is defined per
priority.

I can see no valid use case which is prevented by having the kernel take
the FP adminStatus as input per traffic class (what we do here).
In addition, this also enforces the above constraint by construction.
User space network managers which wish to expose FP adminStatus per
priority are free to do so; they must only observe the prio_tc_map of
the netdev (which presumably is also under their control, when
constructing the mqprio netlink attributes).

The reason for configuring frame preemption as a property of the Qdisc
layer is that the information about "preemptible TCs" is closest to the
place which handles the num_tc and prio_tc_map of the netdev. Had the
UAPI been placed at any other layer, it would be unclear what to do
with the FP information when num_tc collapses to 0. A key assumption is
that only mqprio/taprio change the num_tc and prio_tc_map of the netdev.
Not sure if that's a great assumption to make.

Having FP in tc-mqprio can be seen as an implementation of the use case
defined in 802.1Q Annex S.2 "Preemption used in isolation". There will
be a separate implementation of FP in tc-taprio, for the other use
cases.

Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
Reviewed-by: Ferenc Fejes <fejes@inf.elte.hu>
Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Vladimir Oltean; committed by Jakub Kicinski.
Commit IDs: f62af20b (parent) → c54876cd

Diffstat: 160 insertions(+), 1 deletion(-) across 5 files

include/net/pkt_sched.h (+1)
···
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	unsigned long preemptible_tcs;
 };

 struct tc_taprio_caps {
include/uapi/linux/pkt_sched.h (+16)
···

 #define TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1)

+enum {
+	TC_FP_EXPRESS = 1,
+	TC_FP_PREEMPTIBLE = 2,
+};
+
 struct tc_mqprio_qopt {
 	__u8 num_tc;
 	__u8 prio_tc_map[TC_QOPT_BITMASK + 1];
···
 #define TC_MQPRIO_F_MAX_RATE	0x8

 enum {
+	TCA_MQPRIO_TC_ENTRY_UNSPEC,
+	TCA_MQPRIO_TC_ENTRY_INDEX,	/* u32 */
+	TCA_MQPRIO_TC_ENTRY_FP,		/* u32 */
+
+	/* add new constants above here */
+	__TCA_MQPRIO_TC_ENTRY_CNT,
+	TCA_MQPRIO_TC_ENTRY_MAX = (__TCA_MQPRIO_TC_ENTRY_CNT - 1)
+};
+
+enum {
 	TCA_MQPRIO_UNSPEC,
 	TCA_MQPRIO_MODE,
 	TCA_MQPRIO_SHAPER,
 	TCA_MQPRIO_MIN_RATE64,
 	TCA_MQPRIO_MAX_RATE64,
+	TCA_MQPRIO_TC_ENTRY,
 	__TCA_MQPRIO_MAX,
 };
net/sched/sch_mqprio.c (+127 −1)
···
  * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
  */

+#include <linux/ethtool_netlink.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/kernel.h>
···
 	u32 flags;
 	u64 min_rate[TC_QOPT_MAX_QUEUE];
 	u64 max_rate[TC_QOPT_MAX_QUEUE];
+	u32 fp[TC_QOPT_MAX_QUEUE];
 };

 static int mqprio_enable_offload(struct Qdisc *sch,
···
 	default:
 		return -EINVAL;
 	}
+
+	mqprio_fp_to_offload(priv->fp, &mqprio);

 	err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_MQPRIO,
 					    &mqprio);
···
 	return 0;
 }

+static const struct
+nla_policy mqprio_tc_entry_policy[TCA_MQPRIO_TC_ENTRY_MAX + 1] = {
+	[TCA_MQPRIO_TC_ENTRY_INDEX]	= NLA_POLICY_MAX(NLA_U32,
+							 TC_QOPT_MAX_QUEUE),
+	[TCA_MQPRIO_TC_ENTRY_FP]	= NLA_POLICY_RANGE(NLA_U32,
+							   TC_FP_EXPRESS,
+							   TC_FP_PREEMPTIBLE),
+};
+
 static const struct nla_policy mqprio_policy[TCA_MQPRIO_MAX + 1] = {
 	[TCA_MQPRIO_MODE]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_SHAPER]	= { .len = sizeof(u16) },
 	[TCA_MQPRIO_MIN_RATE64]	= { .type = NLA_NESTED },
 	[TCA_MQPRIO_MAX_RATE64]	= { .type = NLA_NESTED },
+	[TCA_MQPRIO_TC_ENTRY]	= { .type = NLA_NESTED },
 };
+
+static int mqprio_parse_tc_entry(u32 fp[TC_QOPT_MAX_QUEUE],
+				 struct nlattr *opt,
+				 unsigned long *seen_tcs,
+				 struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[TCA_MQPRIO_TC_ENTRY_MAX + 1];
+	int err, tc;
+
+	err = nla_parse_nested(tb, TCA_MQPRIO_TC_ENTRY_MAX, opt,
+			       mqprio_tc_entry_policy, extack);
+	if (err < 0)
+		return err;
+
+	if (NL_REQ_ATTR_CHECK(extack, opt, tb, TCA_MQPRIO_TC_ENTRY_INDEX)) {
+		NL_SET_ERR_MSG(extack, "TC entry index missing");
+		return -EINVAL;
+	}
+
+	tc = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_INDEX]);
+	if (*seen_tcs & BIT(tc)) {
+		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_MQPRIO_TC_ENTRY_INDEX],
+				    "Duplicate tc entry");
+		return -EINVAL;
+	}
+
+	*seen_tcs |= BIT(tc);
+
+	if (tb[TCA_MQPRIO_TC_ENTRY_FP])
+		fp[tc] = nla_get_u32(tb[TCA_MQPRIO_TC_ENTRY_FP]);
+
+	return 0;
+}
+
+static int mqprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *nlattr_opt,
+				   int nlattr_opt_len,
+				   struct netlink_ext_ack *extack)
+{
+	struct mqprio_sched *priv = qdisc_priv(sch);
+	struct net_device *dev = qdisc_dev(sch);
+	bool have_preemption = false;
+	unsigned long seen_tcs = 0;
+	u32 fp[TC_QOPT_MAX_QUEUE];
+	struct nlattr *n;
+	int tc, rem;
+	int err = 0;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		fp[tc] = priv->fp[tc];
+
+	nla_for_each_attr(n, nlattr_opt, nlattr_opt_len, rem) {
+		if (nla_type(n) != TCA_MQPRIO_TC_ENTRY)
+			continue;
+
+		err = mqprio_parse_tc_entry(fp, n, &seen_tcs, extack);
+		if (err)
+			goto out;
+	}
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		priv->fp[tc] = fp[tc];
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			have_preemption = true;
+	}
+
+	if (have_preemption && !ethtool_dev_mm_supported(dev)) {
+		NL_SET_ERR_MSG(extack, "Device does not support preemption");
+		return -EOPNOTSUPP;
+	}
+out:
+	return err;
+}

 /* Parse the other netlink attributes that represent the payload of
  * TCA_OPTIONS, which are appended right after struct tc_mqprio_qopt.
···
 		priv->flags |= TC_MQPRIO_F_MAX_RATE;
 	}

+	if (tb[TCA_MQPRIO_TC_ENTRY]) {
+		err = mqprio_parse_tc_entries(sch, nlattr_opt, nlattr_opt_len,
+					      extack);
+		if (err)
+			return err;
+	}
+
 	return 0;
 }
···
 	int i, err = -EOPNOTSUPP;
 	struct tc_mqprio_qopt *qopt = NULL;
 	struct tc_mqprio_caps caps;
-	int len;
+	int len, tc;

 	BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
 	BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
···
 	if (!opt || nla_len(opt) < sizeof(*qopt))
 		return -EINVAL;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		priv->fp[tc] = TC_FP_EXPRESS;

 	qdisc_offload_query_caps(dev, TC_SETUP_QDISC_MQPRIO,
 				 &caps, sizeof(caps));
···
 	return -1;
 }

+static int mqprio_dump_tc_entries(struct mqprio_sched *priv,
+				  struct sk_buff *skb)
+{
+	struct nlattr *n;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) {
+		n = nla_nest_start(skb, TCA_MQPRIO_TC_ENTRY);
+		if (!n)
+			return -EMSGSIZE;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_INDEX, tc))
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, TCA_MQPRIO_TC_ENTRY_FP, priv->fp[tc]))
+			goto nla_put_failure;
+
+		nla_nest_end(skb, n);
+	}
+
+	return 0;
+
+nla_put_failure:
+	nla_nest_cancel(skb, n);
+	return -EMSGSIZE;
+}
+
 static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
 {
 	struct net_device *dev = qdisc_dev(sch);
···
 	if ((priv->flags & TC_MQPRIO_F_MIN_RATE ||
 	     priv->flags & TC_MQPRIO_F_MAX_RATE) &&
 	    (dump_rates(priv, &opt, skb) != 0))
+		goto nla_put_failure;
+
+	if (mqprio_dump_tc_entries(priv, skb))
 		goto nla_put_failure;

 	return nla_nest_end(skb, nla);
net/sched/sch_mqprio_lib.c (+14)
···
 }
 EXPORT_SYMBOL_GPL(mqprio_qopt_reconstruct);

+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio)
+{
+	unsigned long preemptible_tcs = 0;
+	int tc;
+
+	for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++)
+		if (fp[tc] == TC_FP_PREEMPTIBLE)
+			preemptible_tcs |= BIT(tc);
+
+	mqprio->preemptible_tcs = preemptible_tcs;
+}
+EXPORT_SYMBOL_GPL(mqprio_fp_to_offload);
+
 MODULE_LICENSE("GPL");
net/sched/sch_mqprio_lib.h (+2)
···
 			    struct netlink_ext_ack *extack);
 void mqprio_qopt_reconstruct(struct net_device *dev,
 			     struct tc_mqprio_qopt *qopt);
+void mqprio_fp_to_offload(u32 fp[TC_QOPT_MAX_QUEUE],
+			  struct tc_mqprio_qopt_offload *mqprio);

 #endif