Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

devlink: Extend devlink rate API with traffic classes bandwidth management

Introduce support for specifying relative bandwidth shares between
traffic classes (TC) in the devlink-rate API. This new option allows
users to allocate bandwidth across multiple traffic classes in a
single command.

This feature provides more granular control over traffic management,
especially for scenarios requiring Enhanced Transmission Selection (ETS).

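Users can now define a relative bandwidth share for each traffic class.
For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5
(RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the
total bandwidth. The actual percentage each class receives depends on
the ratio of its share value to the sum of all shares. A share must be
specified for every one of the 8 traffic classes in a single request,
and each class index may appear only once; unused classes can be given
a share of 0, as in the examples below.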

Example:
DEV=pci/0000:08:00.0

$ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \
tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0

$ devlink port function rate set $DEV/vfs_group \
tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0

Example usage with ynl:

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
--do rate-set --json '{
"bus-name": "pci",
"dev-name": "0000:08:00.0",
"port-index": 1,
"rate-tc-bws": [
{"rate-tc-index": 0, "rate-tc-bw": 50},
{"rate-tc-index": 1, "rate-tc-bw": 50},
{"rate-tc-index": 2, "rate-tc-bw": 0},
{"rate-tc-index": 3, "rate-tc-bw": 0},
{"rate-tc-index": 4, "rate-tc-bw": 0},
{"rate-tc-index": 5, "rate-tc-bw": 0},
{"rate-tc-index": 6, "rate-tc-bw": 0},
{"rate-tc-index": 7, "rate-tc-bw": 0}
]
}'

./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \
--do rate-get --json '{
"bus-name": "pci",
"dev-name": "0000:08:00.0",
"port-index": 1
}'

Output for rate-get:
{'bus-name': 'pci',
'dev-name': '0000:08:00.0',
'port-index': 1,
'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0},
{'rate-tc-bw': 50, 'rate-tc-index': 1},
{'rate-tc-bw': 0, 'rate-tc-index': 2},
{'rate-tc-bw': 0, 'rate-tc-index': 3},
{'rate-tc-bw': 0, 'rate-tc-index': 4},
{'rate-tc-bw': 0, 'rate-tc-index': 5},
{'rate-tc-bw': 0, 'rate-tc-index': 6},
{'rate-tc-bw': 0, 'rate-tc-index': 7}],
'rate-tx-max': 0,
'rate-tx-priority': 0,
'rate-tx-share': 0,
'rate-tx-weight': 0,
'rate-type': 'leaf'}

Signed-off-by: Carolina Jubran <cjubran@nvidia.com>
Reviewed-by: Cosmin Ratiu <cratiu@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
Signed-off-by: Mark Bloch <mbloch@nvidia.com>
Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

authored by

Carolina Jubran and committed by
Jakub Kicinski
566e8f10 42401c42

+195 -5
+31 -1
Documentation/netlink/specs/devlink.yaml
··· 224 224 value: 10 225 225 - 226 226 name: binary 227 + - 228 + name: rate-tc-index-max 229 + type: const 230 + value: 7 227 231 228 232 attribute-sets: 229 233 - ··· 848 844 - 849 845 name: region-direct 850 846 type: flag 851 - 847 + - 848 + name: rate-tc-bws 849 + type: nest 850 + multi-attr: true 851 + nested-attributes: dl-rate-tc-bws 852 + - 853 + name: rate-tc-index 854 + type: u8 855 + checks: 856 + max: rate-tc-index-max 857 + - 858 + name: rate-tc-bw 859 + type: u32 860 + doc: | 861 + Specifies the bandwidth share assigned to the Traffic Class. 862 + The bandwidth for the traffic class is determined 863 + in proportion to the sum of the shares of all configured classes. 852 864 - 853 865 name: dl-dev-stats 854 866 subset-of: devlink ··· 1269 1249 - 1270 1250 name: flash 1271 1251 type: flag 1252 + - 1253 + name: dl-rate-tc-bws 1254 + subset-of: devlink 1255 + attributes: 1256 + - 1257 + name: rate-tc-index 1258 + - 1259 + name: rate-tc-bw 1272 1260 1273 1261 operations: 1274 1262 enum-model: directional ··· 2204 2176 - rate-tx-priority 2205 2177 - rate-tx-weight 2206 2178 - rate-parent-node-name 2179 + - rate-tc-bws 2207 2180 2208 2181 - 2209 2182 name: rate-new ··· 2225 2196 - rate-tx-priority 2226 2197 - rate-tx-weight 2227 2198 - rate-parent-node-name 2199 + - rate-tc-bws 2228 2200 2229 2201 - 2230 2202 name: rate-del
+8
include/net/devlink.h
··· 118 118 119 119 u32 tx_priority; 120 120 u32 tx_weight; 121 + 122 + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; 121 123 }; 122 124 123 125 struct devlink_port { ··· 1488 1486 u32 tx_priority, struct netlink_ext_ack *extack); 1489 1487 int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, 1490 1488 u32 tx_weight, struct netlink_ext_ack *extack); 1489 + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, 1490 + void *priv, u32 *tc_bw, 1491 + struct netlink_ext_ack *extack); 1491 1492 int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, 1492 1493 u64 tx_share, struct netlink_ext_ack *extack); 1493 1494 int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, ··· 1499 1494 u32 tx_priority, struct netlink_ext_ack *extack); 1500 1495 int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, 1501 1496 u32 tx_weight, struct netlink_ext_ack *extack); 1497 + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, 1498 + void *priv, u32 *tc_bw, 1499 + struct netlink_ext_ack *extack); 1502 1500 int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, 1503 1501 struct netlink_ext_ack *extack); 1504 1502 int (*rate_node_del)(struct devlink_rate *rate_node, void *priv,
+9
include/uapi/linux/devlink.h
··· 221 221 */ 222 222 }; 223 223 224 + /* IEEE 802.1Qaz standard supported values. */ 225 + 226 + #define DEVLINK_RATE_TCS_MAX 8 227 + #define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1) 228 + 224 229 enum devlink_rate_type { 225 230 DEVLINK_RATE_TYPE_LEAF, 226 231 DEVLINK_RATE_TYPE_NODE, ··· 633 628 DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ 634 629 635 630 DEVLINK_ATTR_REGION_DIRECT, /* flag */ 631 + 632 + DEVLINK_ATTR_RATE_TC_BWS, /* nested */ 633 + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ 634 + DEVLINK_ATTR_RATE_TC_BW, /* u32 */ 636 635 637 636 /* Add new attributes above here, update the spec in 638 637 * Documentation/netlink/specs/devlink.yaml and re-generate
+127
net/devlink/rate.c
··· 80 80 return ERR_PTR(-EINVAL); 81 81 } 82 82 83 + static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) 84 + { 85 + struct nlattr *nla_tc_bw; 86 + int i; 87 + 88 + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { 89 + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); 90 + if (!nla_tc_bw) 91 + return -EMSGSIZE; 92 + 93 + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || 94 + nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) 95 + goto nla_put_failure; 96 + 97 + nla_nest_end(msg, nla_tc_bw); 98 + } 99 + return 0; 100 + 101 + nla_put_failure: 102 + nla_nest_cancel(msg, nla_tc_bw); 103 + return -EMSGSIZE; 104 + } 105 + 83 106 static int devlink_nl_rate_fill(struct sk_buff *msg, 84 107 struct devlink_rate *devlink_rate, 85 108 enum devlink_command cmd, u32 portid, u32 seq, ··· 151 128 if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, 152 129 devlink_rate->parent->name)) 153 130 goto nla_put_failure; 131 + 132 + if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) 133 + goto nla_put_failure; 154 134 155 135 genlmsg_end(msg, hdr); 156 136 return 0; ··· 342 316 return 0; 343 317 } 344 318 319 + static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, 320 + unsigned long *bitmap, 321 + struct netlink_ext_ack *extack) 322 + { 323 + struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; 324 + u8 tc_index; 325 + int err; 326 + 327 + err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, 328 + devlink_dl_rate_tc_bws_nl_policy, extack); 329 + if (err) 330 + return err; 331 + 332 + if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { 333 + NL_SET_ERR_ATTR_MISS(extack, parent_nest, 334 + DEVLINK_ATTR_RATE_TC_INDEX); 335 + return -EINVAL; 336 + } 337 + 338 + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); 339 + 340 + if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { 341 + NL_SET_ERR_ATTR_MISS(extack, parent_nest, 342 + DEVLINK_ATTR_RATE_TC_BW); 343 + return -EINVAL; 344 + } 345 + 346 + if (test_and_set_bit(tc_index, bitmap)) { 347 + 
NL_SET_ERR_MSG_FMT(extack, 348 + "Duplicate traffic class index specified (%u)", 349 + tc_index); 350 + return -EINVAL; 351 + } 352 + 353 + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); 354 + 355 + return 0; 356 + } 357 + 358 + static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, 359 + struct genl_info *info) 360 + { 361 + DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; 362 + struct devlink *devlink = devlink_rate->devlink; 363 + const struct devlink_ops *ops = devlink->ops; 364 + u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; 365 + int rem, err = -EOPNOTSUPP, i; 366 + struct nlattr *attr; 367 + 368 + nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, 369 + GENL_HDRLEN, rem) { 370 + err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, 371 + info->extack); 372 + if (err) 373 + return err; 374 + } 375 + 376 + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { 377 + if (!test_bit(i, bitmap)) { 378 + NL_SET_ERR_MSG_FMT(info->extack, 379 + "Bandwidth values must be specified for all %u traffic classes", 380 + DEVLINK_RATE_TCS_MAX); 381 + return -EINVAL; 382 + } 383 + } 384 + 385 + if (devlink_rate_is_leaf(devlink_rate)) 386 + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, 387 + tc_bw, info->extack); 388 + else if (devlink_rate_is_node(devlink_rate)) 389 + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, 390 + tc_bw, info->extack); 391 + 392 + if (err) 393 + return err; 394 + 395 + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); 396 + 397 + return 0; 398 + } 399 + 345 400 static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, 346 401 const struct devlink_ops *ops, 347 402 struct genl_info *info) ··· 495 388 return err; 496 389 } 497 390 391 + if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { 392 + err = devlink_nl_rate_tc_bw_set(devlink_rate, info); 393 + if (err) 394 + return err; 395 + } 396 + 498 397 return 0; 499 398 } 500 399 ··· 536 423 "TX weight set isn't supported for the leafs"); 537 
424 return false; 538 425 } 426 + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && 427 + !ops->rate_leaf_tc_bw_set) { 428 + NL_SET_ERR_MSG_ATTR(info->extack, 429 + attrs[DEVLINK_ATTR_RATE_TC_BWS], 430 + "TC bandwidth set isn't supported for the leafs"); 431 + return false; 432 + } 539 433 } else if (type == DEVLINK_RATE_TYPE_NODE) { 540 434 if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { 541 435 NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); ··· 567 447 NL_SET_ERR_MSG_ATTR(info->extack, 568 448 attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], 569 449 "TX weight set isn't supported for the nodes"); 450 + return false; 451 + } 452 + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && 453 + !ops->rate_node_tc_bw_set) { 454 + NL_SET_ERR_MSG_ATTR(info->extack, 455 + attrs[DEVLINK_ATTR_RATE_TC_BWS], 456 + "TC bandwidth set isn't supported for the nodes"); 570 457 return false; 571 458 } 572 459 } else {