Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

skbuff: bridge: Add layer 2 miss indication

For EVPN non-DF (Designated Forwarder) filtering we need to be able to
prevent decapsulated traffic from being flooded to a multi-homed host.
Filtering of multicast and broadcast traffic can be achieved using the
following flower filter:

# tc filter add dev bond0 egress pref 1 proto all flower indev vxlan0 dst_mac 01:00:00:00:00:00/01:00:00:00:00:00 action drop

Unlike broadcast and multicast traffic, it is not currently possible to
filter unknown unicast traffic. The classification into unknown unicast
is performed by the bridge driver, but is not visible to other layers
such as tc.

Solve this by adding a new 'l2_miss' bit to the tc skb extension. Clear
the bit whenever a packet enters the bridge (received from a bridge port
or transmitted via the bridge) and set it if the packet did not match an
FDB or MDB entry. If there is no skb extension and the bit needs to be
cleared, then do not allocate one as no extension is equivalent to the
bit being cleared. The bit is not set for broadcast packets as they
never perform a lookup and therefore never incur a miss.

A bit that is set for every flooded packet would also work for the
current use case, but it does not allow us to differentiate between
registered and unregistered multicast traffic, which might be useful in
the future.

To keep the performance impact to a minimum, the marking of packets is
guarded by the 'tc_skb_ext_tc' static key. When 'false', the skb is not
touched and an skb extension is not allocated. Instead, only a
5-byte nop is executed, as demonstrated below for the call site in
br_handle_frame().

Before the patch:

```
memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
c37b09: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12)
c37b10: 00 00

p = br_port_get_rcu(skb->dev);
c37b12: 49 8b 44 24 10 mov 0x10(%r12),%rax
memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
c37b17: 49 c7 44 24 30 00 00 movq $0x0,0x30(%r12)
c37b1e: 00 00
c37b20: 49 c7 44 24 38 00 00 movq $0x0,0x38(%r12)
c37b27: 00 00
```

After the patch (when static key is disabled):

```
memset(skb->cb, 0, sizeof(struct br_input_skb_cb));
c37c29: 49 c7 44 24 28 00 00 movq $0x0,0x28(%r12)
c37c30: 00 00
c37c32: 49 8d 44 24 28 lea 0x28(%r12),%rax
c37c37: 48 c7 40 08 00 00 00 movq $0x0,0x8(%rax)
c37c3e: 00
c37c3f: 48 c7 40 10 00 00 00 movq $0x0,0x10(%rax)
c37c46: 00

#ifdef CONFIG_HAVE_JUMP_LABEL_HACK

static __always_inline bool arch_static_branch(struct static_key *key, bool branch)
{
asm_volatile_goto("1:"
c37c47: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
br_tc_skb_miss_set(skb, false);

p = br_port_get_rcu(skb->dev);
c37c4c: 49 8b 44 24 10 mov 0x10(%r12),%rax
```

Subsequent patches will extend the flower classifier to be able to match
on the new 'l2_miss' bit and enable / disable the static key when
filters that match on it are added / deleted.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Acked-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by Ido Schimmel and committed by Jakub Kicinski
7b4858df 2e246bca

+33
+1
include/linux/skbuff.h
··· 330 330 u8 post_ct_snat:1; 331 331 u8 post_ct_dnat:1; 332 332 u8 act_miss:1; /* Set if act_miss_cookie is used */ 333 + u8 l2_miss:1; /* Set by bridge upon FDB or MDB miss */ 333 334 }; 334 335 #endif 335 336
+1
net/bridge/br_device.c
··· 39 39 u16 vid = 0; 40 40 41 41 memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); 42 + br_tc_skb_miss_set(skb, false); 42 43 43 44 rcu_read_lock(); 44 45 nf_ops = rcu_dereference(nf_br_ops);
+3
net/bridge/br_forward.c
··· 203 203 struct net_bridge_port *prev = NULL; 204 204 struct net_bridge_port *p; 205 205 206 + br_tc_skb_miss_set(skb, pkt_type != BR_PKT_BROADCAST); 207 + 206 208 list_for_each_entry_rcu(p, &br->port_list, list) { 207 209 /* Do not flood unicast traffic to ports that turn it off, nor 208 210 * other traffic if flood off, except for traffic we originate ··· 297 295 allow_mode_include = false; 298 296 } else { 299 297 p = NULL; 298 + br_tc_skb_miss_set(skb, true); 300 299 } 301 300 302 301 while (p || rp) {
+1
net/bridge/br_input.c
··· 334 334 return RX_HANDLER_CONSUMED; 335 335 336 336 memset(skb->cb, 0, sizeof(struct br_input_skb_cb)); 337 + br_tc_skb_miss_set(skb, false); 337 338 338 339 p = br_port_get_rcu(skb->dev); 339 340 if (p->flags & BR_VLAN_TUNNEL)
+27
net/bridge/br_private.h
··· 15 15 #include <linux/u64_stats_sync.h> 16 16 #include <net/route.h> 17 17 #include <net/ip6_fib.h> 18 + #include <net/pkt_cls.h> 18 19 #include <linux/if_vlan.h> 19 20 #include <linux/rhashtable.h> 20 21 #include <linux/refcount.h> ··· 754 753 void br_boolopt_multi_get(const struct net_bridge *br, 755 754 struct br_boolopt_multi *bm); 756 755 void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on); 756 + 757 + #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) 758 + static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) 759 + { 760 + struct tc_skb_ext *ext; 761 + 762 + if (!tc_skb_ext_tc_enabled()) 763 + return; 764 + 765 + ext = skb_ext_find(skb, TC_SKB_EXT); 766 + if (ext) { 767 + ext->l2_miss = miss; 768 + return; 769 + } 770 + if (!miss) 771 + return; 772 + ext = tc_skb_ext_alloc(skb); 773 + if (!ext) 774 + return; 775 + ext->l2_miss = true; 776 + } 777 + #else 778 + static inline void br_tc_skb_miss_set(struct sk_buff *skb, bool miss) 779 + { 780 + } 781 + #endif 757 782 758 783 /* br_device.c */ 759 784 void br_dev_setup(struct net_device *dev);