Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

bridge: Add backup nexthop ID support

Add a new bridge port attribute that allows attaching a nexthop object
ID to an skb that is redirected to a backup bridge port with VLAN
tunneling enabled.

Specifically, when redirecting a known unicast packet, read the backup
nexthop ID from the bridge port that lost its carrier and set it in the
bridge control block of the skb before forwarding it via the backup
port. Note that reading the ID from the bridge port should not result in
a cache miss as the ID is added next to the 'backup_port' field that was
already accessed. After this change, the 'state' field still stays on
the first cache line, together with other data path related fields such
as 'flags' and 'vlgrp':

struct net_bridge_port {
        struct net_bridge *        br;                   /*     0     8 */
        struct net_device *        dev;                  /*     8     8 */
        netdevice_tracker          dev_tracker;          /*    16     0 */
        struct list_head           list;                 /*    16    16 */
        long unsigned int          flags;                /*    32     8 */
        struct net_bridge_vlan_group * vlgrp;            /*    40     8 */
        struct net_bridge_port *   backup_port;          /*    48     8 */
        u32                        backup_nhid;          /*    56     4 */
        u8                         priority;             /*    60     1 */
        u8                         state;                /*    61     1 */
        u16                        port_no;              /*    62     2 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        [...]
} __attribute__((__aligned__(8)));

When forwarding an skb via a bridge port that has VLAN tunneling
enabled, check if the backup nexthop ID stored in the bridge control
block is valid (i.e., not zero). If so, instead of attaching the
pre-allocated metadata (that only has the tunnel key set), allocate
new metadata, set both the tunnel key and the nexthop object ID and
attach it to the skb.

Do not dump the new attribute to user space when it is zero (the
default), as zero is an invalid nexthop object ID.

The above is useful for EVPN multihoming. When one of the links
composing an Ethernet Segment (ES) fails, traffic needs to be redirected
towards the host via one of the other ES peers. For example, if a host
is multihomed to three different VTEPs, the backup port of each ES link
needs to be set to the VXLAN device and the backup nexthop ID needs to
point to an FDB nexthop group that includes the IP addresses of the
other two VTEPs. The VXLAN driver will extract the ID from the metadata
of the redirected skb, calculate its flow hash and forward it towards
one of the other VTEPs. If the ID does not exist, or represents an
invalid nexthop object, the VXLAN driver will drop the skb. This
relieves the bridge driver from the need to validate the ID.

Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

authored by

Ido Schimmel and committed by
David S. Miller
29cfb2aa d977e1c8

+33 -1
+1
include/uapi/linux/if_link.h
··· 570 570 IFLA_BRPORT_MCAST_N_GROUPS, 571 571 IFLA_BRPORT_MCAST_MAX_GROUPS, 572 572 IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, 573 + IFLA_BRPORT_BACKUP_NHID, 573 574 __IFLA_BRPORT_MAX 574 575 }; 575 576 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
+1
net/bridge/br_forward.c
··· 154 154 backup_port = rcu_dereference(to->backup_port); 155 155 if (unlikely(!backup_port)) 156 156 goto out; 157 + BR_INPUT_SKB_CB(skb)->backup_nhid = READ_ONCE(to->backup_nhid); 157 158 to = backup_port; 158 159 } 159 160
+12
net/bridge/br_netlink.c
··· 211 211 + nla_total_size(sizeof(u8)) /* IFLA_BRPORT_MRP_IN_OPEN */ 212 212 + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT */ 213 213 + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_MCAST_EHT_HOSTS_CNT */ 214 + + nla_total_size(sizeof(u32)) /* IFLA_BRPORT_BACKUP_NHID */ 214 215 + 0; 215 216 } 216 217 ··· 319 318 nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT, 320 319 backup_p->dev->ifindex); 321 320 rcu_read_unlock(); 321 + 322 + if (p->backup_nhid && 323 + nla_put_u32(skb, IFLA_BRPORT_BACKUP_NHID, p->backup_nhid)) 324 + return -EMSGSIZE; 322 325 323 326 return 0; 324 327 } ··· 900 895 [IFLA_BRPORT_MCAST_N_GROUPS] = { .type = NLA_REJECT }, 901 896 [IFLA_BRPORT_MCAST_MAX_GROUPS] = { .type = NLA_U32 }, 902 897 [IFLA_BRPORT_NEIGH_VLAN_SUPPRESS] = NLA_POLICY_MAX(NLA_U8, 1), 898 + [IFLA_BRPORT_BACKUP_NHID] = { .type = NLA_U32 }, 903 899 }; 904 900 905 901 /* Change the state of the port and notify spanning tree */ ··· 1069 1063 err = nbp_backup_change(p, backup_dev); 1070 1064 if (err) 1071 1065 return err; 1066 + } 1067 + 1068 + if (tb[IFLA_BRPORT_BACKUP_NHID]) { 1069 + u32 backup_nhid = nla_get_u32(tb[IFLA_BRPORT_BACKUP_NHID]); 1070 + 1071 + WRITE_ONCE(p->backup_nhid, backup_nhid); 1072 1072 } 1073 1073 1074 1074 return 0;
+3
net/bridge/br_private.h
··· 387 387 struct net_bridge_vlan_group __rcu *vlgrp; 388 388 #endif 389 389 struct net_bridge_port __rcu *backup_port; 390 + u32 backup_nhid; 390 391 391 392 /* STP */ 392 393 u8 priority; ··· 606 605 */ 607 606 unsigned long fwd_hwdoms; 608 607 #endif 608 + 609 + u32 backup_nhid; 609 610 }; 610 611 611 612 #define BR_INPUT_SKB_CB(__skb) ((struct br_input_skb_cb *)(__skb)->cb)
+15
net/bridge/br_vlan_tunnel.c
··· 201 201 if (err) 202 202 return err; 203 203 204 + if (BR_INPUT_SKB_CB(skb)->backup_nhid) { 205 + tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, 206 + tunnel_id, 0); 207 + if (!tunnel_dst) 208 + return -ENOMEM; 209 + 210 + tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | 211 + IP_TUNNEL_INFO_BRIDGE; 212 + tunnel_dst->u.tun_info.key.nhid = 213 + BR_INPUT_SKB_CB(skb)->backup_nhid; 214 + skb_dst_set(skb, &tunnel_dst->dst); 215 + 216 + return 0; 217 + } 218 + 204 219 tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); 205 220 if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) 206 221 skb_dst_set(skb, &tunnel_dst->dst);
+1 -1
net/core/rtnetlink.c
··· 61 61 #include "dev.h" 62 62 63 63 #define RTNL_MAX_TYPE 50 64 - #define RTNL_SLAVE_MAX_TYPE 43 64 + #define RTNL_SLAVE_MAX_TYPE 44 65 65 66 66 struct rtnl_link { 67 67 rtnl_doit_func doit;