Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

net: sched: support hash selecting tx queue

This patch allows users to pick a queue_mapping range,
from A to B. Then we can load balance packets across
tx queues A to B. The range is an unsigned 16-bit value
in decimal format.

$ tc filter ... action skbedit queue_mapping skbhash A B

"skbedit queue_mapping QUEUE_MAPPING" (from "man 8 tc-skbedit")
is enhanced with flags: SKBEDIT_F_TXQ_SKBHASH

+----+ +----+ +----+
| P1 | | P2 | | Pn |
+----+ +----+ +----+
| | |
+-----------+-----------+
|
| clsact/skbedit
| MQ
v
+-----------+-----------+
| q0 | qn | qm
v v v
HTB/FQ FIFO ... FIFO

For example:
If P1 sends out packets to different Pods on another host, and
we want to distribute the flows across queues qn - qm, then we
can use skb->hash as the hash.

setup commands:
$ NETDEV=eth0
$ ip netns add n1
$ ip link add ipv1 link $NETDEV type ipvlan mode l2
$ ip link set ipv1 netns n1
$ ip netns exec n1 ifconfig ipv1 2.2.2.100/24 up

$ tc qdisc add dev $NETDEV clsact
$ tc filter add dev $NETDEV egress protocol ip prio 1 \
flower skip_hw src_ip 2.2.2.100 action skbedit queue_mapping skbhash 2 6
$ tc qdisc add dev $NETDEV handle 1: root mq
$ tc qdisc add dev $NETDEV parent 1:1 handle 2: htb
$ tc class add dev $NETDEV parent 2: classid 2:1 htb rate 100kbit
$ tc class add dev $NETDEV parent 2: classid 2:2 htb rate 200kbit
$ tc qdisc add dev $NETDEV parent 1:2 tbf rate 100mbit burst 100mb latency 1
$ tc qdisc add dev $NETDEV parent 1:3 pfifo
$ tc qdisc add dev $NETDEV parent 1:4 pfifo
$ tc qdisc add dev $NETDEV parent 1:5 pfifo
$ tc qdisc add dev $NETDEV parent 1:6 pfifo
$ tc qdisc add dev $NETDEV parent 1:7 pfifo

$ ip netns exec n1 iperf3 -c 2.2.2.1 -i 1 -t 10 -P 10

The tx queue is picked from the range 2 - 6:
$ ethtool -S $NETDEV | grep -i tx_queue_[0-9]_bytes
tx_queue_0_bytes: 42
tx_queue_1_bytes: 0
tx_queue_2_bytes: 11442586444
tx_queue_3_bytes: 7383615334
tx_queue_4_bytes: 3981365579
tx_queue_5_bytes: 3983235051
tx_queue_6_bytes: 6706236461
tx_queue_7_bytes: 42
tx_queue_8_bytes: 0
tx_queue_9_bytes: 0

Tx queues 2 - 6 are mapped to classids 1:3 - 1:7:
$ tc -s class show dev $NETDEV
...
class mq 1:3 root leaf 8002:
Sent 11949133672 bytes 7929798 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
class mq 1:4 root leaf 8003:
Sent 7710449050 bytes 5117279 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
class mq 1:5 root leaf 8004:
Sent 4157648675 bytes 2758990 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
class mq 1:6 root leaf 8005:
Sent 4159632195 bytes 2759990 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
class mq 1:7 root leaf 8006:
Sent 7003169603 bytes 4646912 pkt (dropped 0, overlimits 0 requeues 0)
backlog 0b 0p requeues 0
...

Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Cong Wang <xiyou.wangcong@gmail.com>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Jonathan Lemon <jonathan.lemon@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Lobakin <alobakin@pm.me>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: Talal Ahmad <talalahmad@google.com>
Cc: Kevin Hao <haokexin@gmail.com>
Cc: Ilias Apalodimas <ilias.apalodimas@linaro.org>
Cc: Kees Cook <keescook@chromium.org>
Cc: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Cc: Antoine Tenart <atenart@kernel.org>
Cc: Wei Wang <weiwan@google.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Reviewed-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>

authored by

Tonghao Zhang and committed by
Paolo Abeni
38a6f086 2f1e85b1

+50 -2
+1
include/net/tc_act/tc_skbedit.h
··· 17 17 u32 mark; 18 18 u32 mask; 19 19 u16 queue_mapping; 20 + u16 mapping_mod; 20 21 u16 ptype; 21 22 struct rcu_head rcu; 22 23 };
+2
include/uapi/linux/tc_act/tc_skbedit.h
··· 29 29 #define SKBEDIT_F_PTYPE 0x8 30 30 #define SKBEDIT_F_MASK 0x10 31 31 #define SKBEDIT_F_INHERITDSFIELD 0x20 32 + #define SKBEDIT_F_TXQ_SKBHASH 0x40 32 33 33 34 struct tc_skbedit { 34 35 tc_gen; ··· 46 45 TCA_SKBEDIT_PTYPE, 47 46 TCA_SKBEDIT_MASK, 48 47 TCA_SKBEDIT_FLAGS, 48 + TCA_SKBEDIT_QUEUE_MAPPING_MAX, 49 49 __TCA_SKBEDIT_MAX 50 50 }; 51 51 #define TCA_SKBEDIT_MAX (__TCA_SKBEDIT_MAX - 1)
+47 -2
net/sched/act_skbedit.c
··· 23 23 static unsigned int skbedit_net_id; 24 24 static struct tc_action_ops act_skbedit_ops; 25 25 26 + static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, 27 + struct sk_buff *skb) 28 + { 29 + u16 queue_mapping = params->queue_mapping; 30 + 31 + if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { 32 + u32 hash = skb_get_hash(skb); 33 + 34 + queue_mapping += hash % params->mapping_mod; 35 + } 36 + 37 + return netdev_cap_txqueue(skb->dev, queue_mapping); 38 + } 39 + 26 40 static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, 27 41 struct tcf_result *res) 28 42 { ··· 76 62 #ifdef CONFIG_NET_EGRESS 77 63 netdev_xmit_skip_txqueue(true); 78 64 #endif 79 - skb_set_queue_mapping(skb, params->queue_mapping); 65 + skb_set_queue_mapping(skb, tcf_skbedit_hash(params, skb)); 80 66 } 81 67 if (params->flags & SKBEDIT_F_MARK) { 82 68 skb->mark &= ~params->mask; ··· 110 96 [TCA_SKBEDIT_PTYPE] = { .len = sizeof(u16) }, 111 97 [TCA_SKBEDIT_MASK] = { .len = sizeof(u32) }, 112 98 [TCA_SKBEDIT_FLAGS] = { .len = sizeof(u64) }, 99 + [TCA_SKBEDIT_QUEUE_MAPPING_MAX] = { .len = sizeof(u16) }, 113 100 }; 114 101 115 102 static int tcf_skbedit_init(struct net *net, struct nlattr *nla, ··· 127 112 struct tcf_skbedit *d; 128 113 u32 flags = 0, *priority = NULL, *mark = NULL, *mask = NULL; 129 114 u16 *queue_mapping = NULL, *ptype = NULL; 115 + u16 mapping_mod = 1; 130 116 bool exists = false; 131 117 int ret = 0, err; 132 118 u32 index; ··· 173 157 if (tb[TCA_SKBEDIT_FLAGS] != NULL) { 174 158 u64 *pure_flags = nla_data(tb[TCA_SKBEDIT_FLAGS]); 175 159 160 + if (*pure_flags & SKBEDIT_F_TXQ_SKBHASH) { 161 + u16 *queue_mapping_max; 162 + 163 + if (!tb[TCA_SKBEDIT_QUEUE_MAPPING] || 164 + !tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]) { 165 + NL_SET_ERR_MSG_MOD(extack, "Missing required range of queue_mapping."); 166 + return -EINVAL; 167 + } 168 + 169 + queue_mapping_max = 170 + nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING_MAX]); 171 + if (*queue_mapping_max < *queue_mapping) { 172 + 
NL_SET_ERR_MSG_MOD(extack, "The range of queue_mapping is invalid, max < min."); 173 + return -EINVAL; 174 + } 175 + 176 + mapping_mod = *queue_mapping_max - *queue_mapping + 1; 177 + flags |= SKBEDIT_F_TXQ_SKBHASH; 178 + } 176 179 if (*pure_flags & SKBEDIT_F_INHERITDSFIELD) 177 180 flags |= SKBEDIT_F_INHERITDSFIELD; 178 181 } ··· 243 208 params_new->flags = flags; 244 209 if (flags & SKBEDIT_F_PRIORITY) 245 210 params_new->priority = *priority; 246 - if (flags & SKBEDIT_F_QUEUE_MAPPING) 211 + if (flags & SKBEDIT_F_QUEUE_MAPPING) { 247 212 params_new->queue_mapping = *queue_mapping; 213 + params_new->mapping_mod = mapping_mod; 214 + } 248 215 if (flags & SKBEDIT_F_MARK) 249 216 params_new->mark = *mark; 250 217 if (flags & SKBEDIT_F_PTYPE) ··· 313 276 goto nla_put_failure; 314 277 if (params->flags & SKBEDIT_F_INHERITDSFIELD) 315 278 pure_flags |= SKBEDIT_F_INHERITDSFIELD; 279 + if (params->flags & SKBEDIT_F_TXQ_SKBHASH) { 280 + if (nla_put_u16(skb, TCA_SKBEDIT_QUEUE_MAPPING_MAX, 281 + params->queue_mapping + params->mapping_mod - 1)) 282 + goto nla_put_failure; 283 + 284 + pure_flags |= SKBEDIT_F_TXQ_SKBHASH; 285 + } 316 286 if (pure_flags != 0 && 317 287 nla_put(skb, TCA_SKBEDIT_FLAGS, sizeof(pure_flags), &pure_flags)) 318 288 goto nla_put_failure; ··· 369 325 return nla_total_size(sizeof(struct tc_skbedit)) 370 326 + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ 371 327 + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ 328 + + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING_MAX */ 372 329 + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ 373 330 + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ 374 331 + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */