Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

add DOVE extensions for VXLAN

This patch provides extensions to VXLAN for supporting Distributed
Overlay Virtual Ethernet (DOVE) networks. The patch includes:

+ per-VXLAN-device flags (proxy, RSC, L2 miss, L3 miss) to enable the DOVE extensions
+ ARP reduction, whereby a bridge-connected VXLAN tunnel endpoint
answers ARP requests from the local bridge on behalf of
remote DOVE clients
+ route short-circuiting (aka L3 switching): packets for known destination
IP addresses are switched directly to the corresponding destination MAC
address rather than going to a (possibly remote) router first
+ netlink notification messages for forwarding table and L3 switching
misses

Changes since v2:
- combined bools into "u32 flags"
- replaced loop with !is_zero_ether_addr()

Signed-off-by: David L Stevens <dlstevens@us.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by David Stevens and committed by David S. Miller
e4f67add ff33c0e1

+235 -25
+231 -25
drivers/net/vxlan.c
··· 29 29 #include <linux/etherdevice.h> 30 30 #include <linux/if_ether.h> 31 31 #include <linux/hash.h> 32 + #include <net/arp.h> 33 + #include <net/ndisc.h> 32 34 #include <net/ip.h> 33 35 #include <net/icmp.h> 34 36 #include <net/udp.h> ··· 112 110 __u16 port_max; 113 111 __u8 tos; /* TOS override */ 114 112 __u8 ttl; 115 - bool learn; 113 + u32 flags; /* VXLAN_F_* below */ 116 114 117 115 unsigned long age_interval; 118 116 struct timer_list age_timer; ··· 122 120 123 121 struct hlist_head fdb_head[FDB_HASH_SIZE]; 124 122 }; 123 + 124 + #define VXLAN_F_LEARN 0x01 125 + #define VXLAN_F_PROXY 0x02 126 + #define VXLAN_F_RSC 0x04 127 + #define VXLAN_F_L2MISS 0x08 128 + #define VXLAN_F_L3MISS 0x10 125 129 126 130 /* salt for hash table */ 127 131 static u32 vxlan_salt __read_mostly; ··· 162 154 struct nda_cacheinfo ci; 163 155 struct nlmsghdr *nlh; 164 156 struct ndmsg *ndm; 157 + bool send_ip, send_eth; 165 158 166 159 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); 167 160 if (nlh == NULL) ··· 170 161 171 162 ndm = nlmsg_data(nlh); 172 163 memset(ndm, 0, sizeof(*ndm)); 173 - ndm->ndm_family = AF_BRIDGE; 164 + 165 + send_eth = send_ip = true; 166 + 167 + if (type == RTM_GETNEIGH) { 168 + ndm->ndm_family = AF_INET; 169 + send_ip = fdb->remote_ip != 0; 170 + send_eth = !is_zero_ether_addr(fdb->eth_addr); 171 + } else 172 + ndm->ndm_family = AF_BRIDGE; 174 173 ndm->ndm_state = fdb->state; 175 174 ndm->ndm_ifindex = vxlan->dev->ifindex; 176 175 ndm->ndm_flags = NTF_SELF; 177 176 ndm->ndm_type = NDA_DST; 178 177 179 - if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 178 + if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 180 179 goto nla_put_failure; 181 180 182 - if (nla_put_be32(skb, NDA_DST, fdb->remote_ip)) 181 + if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip)) 183 182 goto nla_put_failure; 184 183 185 184 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ··· 237 220 errout: 238 221 if (err < 0) 239 222 
rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 223 + } 224 + 225 + static void vxlan_ip_miss(struct net_device *dev, __be32 ipa) 226 + { 227 + struct vxlan_dev *vxlan = netdev_priv(dev); 228 + struct vxlan_fdb f; 229 + 230 + memset(&f, 0, sizeof f); 231 + f.state = NUD_STALE; 232 + f.remote_ip = ipa; /* goes to NDA_DST */ 233 + 234 + vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); 235 + } 236 + 237 + static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) 238 + { 239 + struct vxlan_fdb f; 240 + 241 + memset(&f, 0, sizeof f); 242 + f.state = NUD_STALE; 243 + memcpy(f.eth_addr, eth_addr, ETH_ALEN); 244 + 245 + vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH); 240 246 } 241 247 242 248 /* Hash Ethernet address */ ··· 591 551 goto drop; 592 552 } 593 553 554 + skb_reset_mac_header(skb); 555 + 594 556 /* Re-examine inner Ethernet packet */ 595 557 oip = ip_hdr(skb); 596 558 skb->protocol = eth_type_trans(skb, vxlan->dev); ··· 602 560 vxlan->dev->dev_addr) == 0) 603 561 goto drop; 604 562 605 - if (vxlan->learn) 563 + if (vxlan->flags & VXLAN_F_LEARN) 606 564 vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source); 607 565 608 566 __skb_tunnel_rx(skb, vxlan->dev); ··· 641 599 return 0; 642 600 } 643 601 602 + static int arp_reduce(struct net_device *dev, struct sk_buff *skb) 603 + { 604 + struct vxlan_dev *vxlan = netdev_priv(dev); 605 + struct arphdr *parp; 606 + u8 *arpptr, *sha; 607 + __be32 sip, tip; 608 + struct neighbour *n; 609 + 610 + if (dev->flags & IFF_NOARP) 611 + goto out; 612 + 613 + if (!pskb_may_pull(skb, arp_hdr_len(dev))) { 614 + dev->stats.tx_dropped++; 615 + goto out; 616 + } 617 + parp = arp_hdr(skb); 618 + 619 + if ((parp->ar_hrd != htons(ARPHRD_ETHER) && 620 + parp->ar_hrd != htons(ARPHRD_IEEE802)) || 621 + parp->ar_pro != htons(ETH_P_IP) || 622 + parp->ar_op != htons(ARPOP_REQUEST) || 623 + parp->ar_hln != dev->addr_len || 624 + parp->ar_pln != 4) 625 + goto out; 626 + arpptr = (u8 *)parp + sizeof(struct arphdr); 627 + sha = arpptr; 628 
+ arpptr += dev->addr_len; /* sha */ 629 + memcpy(&sip, arpptr, sizeof(sip)); 630 + arpptr += sizeof(sip); 631 + arpptr += dev->addr_len; /* tha */ 632 + memcpy(&tip, arpptr, sizeof(tip)); 633 + 634 + if (ipv4_is_loopback(tip) || 635 + ipv4_is_multicast(tip)) 636 + goto out; 637 + 638 + n = neigh_lookup(&arp_tbl, &tip, dev); 639 + 640 + if (n) { 641 + struct vxlan_dev *vxlan = netdev_priv(dev); 642 + struct vxlan_fdb *f; 643 + struct sk_buff *reply; 644 + 645 + if (!(n->nud_state & NUD_CONNECTED)) { 646 + neigh_release(n); 647 + goto out; 648 + } 649 + 650 + f = vxlan_find_mac(vxlan, n->ha); 651 + if (f && f->remote_ip == 0) { 652 + /* bridge-local neighbor */ 653 + neigh_release(n); 654 + goto out; 655 + } 656 + 657 + reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, 658 + n->ha, sha); 659 + 660 + neigh_release(n); 661 + 662 + skb_reset_mac_header(reply); 663 + __skb_pull(reply, skb_network_offset(reply)); 664 + reply->ip_summed = CHECKSUM_UNNECESSARY; 665 + reply->pkt_type = PACKET_HOST; 666 + 667 + if (netif_rx_ni(reply) == NET_RX_DROP) 668 + dev->stats.rx_dropped++; 669 + } else if (vxlan->flags & VXLAN_F_L3MISS) 670 + vxlan_ip_miss(dev, tip); 671 + out: 672 + consume_skb(skb); 673 + return NETDEV_TX_OK; 674 + } 675 + 676 + static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) 677 + { 678 + struct vxlan_dev *vxlan = netdev_priv(dev); 679 + struct neighbour *n; 680 + struct iphdr *pip; 681 + 682 + if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) 683 + return false; 684 + 685 + n = NULL; 686 + switch (ntohs(eth_hdr(skb)->h_proto)) { 687 + case ETH_P_IP: 688 + if (!pskb_may_pull(skb, sizeof(struct iphdr))) 689 + return false; 690 + pip = ip_hdr(skb); 691 + n = neigh_lookup(&arp_tbl, &pip->daddr, dev); 692 + break; 693 + default: 694 + return false; 695 + } 696 + 697 + if (n) { 698 + bool diff; 699 + 700 + diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0; 701 + if (diff) { 702 + memcpy(eth_hdr(skb)->h_source, 
eth_hdr(skb)->h_dest, 703 + dev->addr_len); 704 + memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); 705 + } 706 + neigh_release(n); 707 + return diff; 708 + } else if (vxlan->flags & VXLAN_F_L3MISS) 709 + vxlan_ip_miss(dev, pip->daddr); 710 + return false; 711 + } 712 + 644 713 /* Extract dsfield from inner protocol */ 645 714 static inline u8 vxlan_get_dsfield(const struct iphdr *iph, 646 715 const struct sk_buff *skb) ··· 772 619 u8 inner = vxlan_get_dsfield(iph, skb); 773 620 774 621 return INET_ECN_encapsulate(tos, inner); 775 - } 776 - 777 - static __be32 vxlan_find_dst(struct vxlan_dev *vxlan, struct sk_buff *skb) 778 - { 779 - const struct ethhdr *eth = (struct ethhdr *) skb->data; 780 - const struct vxlan_fdb *f; 781 - 782 - if (is_multicast_ether_addr(eth->h_dest)) 783 - return vxlan->gaddr; 784 - 785 - f = vxlan_find_mac(vxlan, eth->h_dest); 786 - if (f) 787 - return f->remote_ip; 788 - else 789 - return vxlan->gaddr; 790 - 791 622 } 792 623 793 624 static void vxlan_sock_free(struct sk_buff *skb) ··· 820 683 struct vxlan_dev *vxlan = netdev_priv(dev); 821 684 struct rtable *rt; 822 685 const struct iphdr *old_iph; 686 + struct ethhdr *eth; 823 687 struct iphdr *iph; 824 688 struct vxlanhdr *vxh; 825 689 struct udphdr *uh; ··· 831 693 __be16 df = 0; 832 694 __u8 tos, ttl; 833 695 int err; 696 + bool did_rsc = false; 697 + const struct vxlan_fdb *f; 834 698 835 - dst = vxlan_find_dst(vxlan, skb); 836 - if (!dst) 699 + skb_reset_mac_header(skb); 700 + eth = eth_hdr(skb); 701 + 702 + if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP) 703 + return arp_reduce(dev, skb); 704 + else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP) 705 + did_rsc = route_shortcircuit(dev, skb); 706 + 707 + f = vxlan_find_mac(vxlan, eth->h_dest); 708 + if (f == NULL) { 709 + did_rsc = false; 710 + dst = vxlan->gaddr; 711 + if (!dst && (vxlan->flags & VXLAN_F_L2MISS) && 712 + !is_multicast_ether_addr(eth->h_dest)) 713 + vxlan_fdb_miss(vxlan, 
eth->h_dest); 714 + } else 715 + dst = f->remote_ip; 716 + 717 + if (!dst) { 718 + if (did_rsc) { 719 + __skb_pull(skb, skb_network_offset(skb)); 720 + skb->ip_summed = CHECKSUM_NONE; 721 + skb->pkt_type = PACKET_HOST; 722 + 723 + /* short-circuited back to local bridge */ 724 + if (netif_rx(skb) == NET_RX_SUCCESS) { 725 + struct vxlan_stats *stats = 726 + this_cpu_ptr(vxlan->stats); 727 + 728 + u64_stats_update_begin(&stats->syncp); 729 + stats->tx_packets++; 730 + stats->tx_bytes += pkt_len; 731 + u64_stats_update_end(&stats->syncp); 732 + } else { 733 + dev->stats.tx_errors++; 734 + dev->stats.tx_aborted_errors++; 735 + } 736 + return NETDEV_TX_OK; 737 + } 837 738 goto drop; 739 + } 838 740 839 741 /* Need space for new headers (invalidates iph ptr) */ 840 742 if (skb_cow_head(skb, VXLAN_HEADROOM)) ··· 1197 1019 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 1198 1020 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 1199 1021 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 1022 + [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 1023 + [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 1024 + [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 1025 + [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 1200 1026 }; 1201 1027 1202 1028 static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) ··· 1296 1114 vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 1297 1115 1298 1116 if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) 1299 - vxlan->learn = true; 1117 + vxlan->flags |= VXLAN_F_LEARN; 1300 1118 1301 1119 if (data[IFLA_VXLAN_AGEING]) 1302 1120 vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 1303 1121 else 1304 1122 vxlan->age_interval = FDB_AGE_DEFAULT; 1123 + 1124 + if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) 1125 + vxlan->flags |= VXLAN_F_PROXY; 1126 + 1127 + if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) 1128 + vxlan->flags |= VXLAN_F_RSC; 1129 + 1130 + if (data[IFLA_VXLAN_L2MISS] && 
nla_get_u8(data[IFLA_VXLAN_L2MISS])) 1131 + vxlan->flags |= VXLAN_F_L2MISS; 1132 + 1133 + if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) 1134 + vxlan->flags |= VXLAN_F_L3MISS; 1305 1135 1306 1136 if (data[IFLA_VXLAN_LIMIT]) 1307 1137 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); ··· 1351 1157 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 1352 1158 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 1353 1159 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 1160 + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 1161 + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 1162 + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 1163 + nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 1354 1164 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 1355 1165 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 1356 1166 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + ··· 1383 1185 1384 1186 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || 1385 1187 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || 1386 - nla_put_u8(skb, IFLA_VXLAN_LEARNING, vxlan->learn) || 1188 + nla_put_u8(skb, IFLA_VXLAN_LEARNING, 1189 + !!(vxlan->flags & VXLAN_F_LEARN)) || 1190 + nla_put_u8(skb, IFLA_VXLAN_PROXY, 1191 + !!(vxlan->flags & VXLAN_F_PROXY)) || 1192 + nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || 1193 + nla_put_u8(skb, IFLA_VXLAN_L2MISS, 1194 + !!(vxlan->flags & VXLAN_F_L2MISS)) || 1195 + nla_put_u8(skb, IFLA_VXLAN_L3MISS, 1196 + !!(vxlan->flags & VXLAN_F_L3MISS)) || 1387 1197 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || 1388 1198 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax)) 1389 1199 goto nla_put_failure;
+4
include/uapi/linux/if_link.h
··· 302 302 IFLA_VXLAN_AGEING, 303 303 IFLA_VXLAN_LIMIT, 304 304 IFLA_VXLAN_PORT_RANGE, 305 + IFLA_VXLAN_PROXY, 306 + IFLA_VXLAN_RSC, 307 + IFLA_VXLAN_L2MISS, 308 + IFLA_VXLAN_L3MISS, 305 309 __IFLA_VXLAN_MAX 306 310 }; 307 311 #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1)