Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.18 2845 lines 73 kB view raw
1/* 2 * VXLAN: Virtual eXtensible Local Area Network 3 * 4 * Copyright (c) 2012-2013 Vyatta Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 2 as 8 * published by the Free Software Foundation. 9 */ 10 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13#include <linux/kernel.h> 14#include <linux/types.h> 15#include <linux/module.h> 16#include <linux/errno.h> 17#include <linux/slab.h> 18#include <linux/skbuff.h> 19#include <linux/rculist.h> 20#include <linux/netdevice.h> 21#include <linux/in.h> 22#include <linux/ip.h> 23#include <linux/udp.h> 24#include <linux/igmp.h> 25#include <linux/etherdevice.h> 26#include <linux/if_ether.h> 27#include <linux/if_vlan.h> 28#include <linux/hash.h> 29#include <linux/ethtool.h> 30#include <net/arp.h> 31#include <net/ndisc.h> 32#include <net/ip.h> 33#include <net/ip_tunnels.h> 34#include <net/icmp.h> 35#include <net/udp.h> 36#include <net/udp_tunnel.h> 37#include <net/rtnetlink.h> 38#include <net/route.h> 39#include <net/dsfield.h> 40#include <net/inet_ecn.h> 41#include <net/net_namespace.h> 42#include <net/netns/generic.h> 43#include <net/vxlan.h> 44#include <net/protocol.h> 45#include <net/udp_tunnel.h> 46#if IS_ENABLED(CONFIG_IPV6) 47#include <net/ipv6.h> 48#include <net/addrconf.h> 49#include <net/ip6_tunnel.h> 50#include <net/ip6_checksum.h> 51#endif 52 53#define VXLAN_VERSION "0.1" 54 55#define PORT_HASH_BITS 8 56#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) 57#define VNI_HASH_BITS 10 58#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) 59#define FDB_HASH_BITS 8 60#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) 61#define FDB_AGE_DEFAULT 300 /* 5 min */ 62#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ 63 64#define VXLAN_N_VID (1u << 24) 65#define VXLAN_VID_MASK (VXLAN_N_VID - 1) 66#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) 67 68#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ 69 70/* UDP port for VXLAN traffic. 71 * The IANA assigned port is 4789, but the Linux default is 8472 72 * for compatibility with early adopters. 73 */ 74static unsigned short vxlan_port __read_mostly = 8472; 75module_param_named(udp_port, vxlan_port, ushort, 0444); 76MODULE_PARM_DESC(udp_port, "Destination UDP port"); 77 78static bool log_ecn_error = true; 79module_param(log_ecn_error, bool, 0644); 80MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 81 82static int vxlan_net_id; 83 84static const u8 all_zeros_mac[ETH_ALEN]; 85 86/* per-network namespace private data for this module */ 87struct vxlan_net { 88 struct list_head vxlan_list; 89 struct hlist_head sock_list[PORT_HASH_SIZE]; 90 spinlock_t sock_lock; 91}; 92 93union vxlan_addr { 94 struct sockaddr_in sin; 95 struct sockaddr_in6 sin6; 96 struct sockaddr sa; 97}; 98 99struct vxlan_rdst { 100 union vxlan_addr remote_ip; 101 __be16 remote_port; 102 u32 remote_vni; 103 u32 remote_ifindex; 104 struct list_head list; 105 struct rcu_head rcu; 106}; 107 108/* Forwarding table entry */ 109struct vxlan_fdb { 110 struct hlist_node hlist; /* linked list of entries */ 111 struct rcu_head rcu; 112 unsigned long updated; /* jiffies */ 113 unsigned long used; 114 struct list_head remotes; 115 u16 state; /* see ndm_state */ 116 u8 flags; /* see ndm_flags */ 117 u8 eth_addr[ETH_ALEN]; 118}; 119 120/* Pseudo network device */ 121struct vxlan_dev { 122 struct hlist_node hlist; /* vni hash table */ 123 struct list_head next; /* vxlan's per namespace list */ 124 struct vxlan_sock *vn_sock; /* listening socket */ 125 struct net_device *dev; 126 struct net *net; /* netns for packet i/o */ 127 struct vxlan_rdst default_dst; /* default destination */ 128 union vxlan_addr saddr; /* source address */ 129 __be16 dst_port; 130 __u16 port_min; /* source port range */ 131 __u16 port_max; 132 __u8 tos; /* TOS override */ 133 __u8 ttl; 134 u32 flags; /* VXLAN_F_* in vxlan.h */ 135 136 struct work_struct sock_work; 137 struct work_struct igmp_join; 138 struct work_struct igmp_leave; 139 140 unsigned long age_interval; 141 struct timer_list age_timer; 142 spinlock_t hash_lock; 143 unsigned int addrcnt; 144 unsigned int addrmax; 145 146 struct hlist_head fdb_head[FDB_HASH_SIZE]; 147}; 148 149/* salt for hash table */ 150static u32 vxlan_salt __read_mostly; 151static struct workqueue_struct *vxlan_wq; 152 153static void vxlan_sock_work(struct work_struct *work); 154 155#if IS_ENABLED(CONFIG_IPV6) 156static inline 157bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 158{ 159 if (a->sa.sa_family != b->sa.sa_family) 160 return false; 161 if (a->sa.sa_family == AF_INET6) 162 return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); 163 else 164 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 165} 166 167static inline bool vxlan_addr_any(const union vxlan_addr *ipa) 168{ 169 if (ipa->sa.sa_family == AF_INET6) 170 return ipv6_addr_any(&ipa->sin6.sin6_addr); 171 else 172 return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); 173} 174 175static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) 176{ 177 if (ipa->sa.sa_family == AF_INET6) 178 return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); 179 else 180 return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); 181} 182 183static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 184{ 185 if (nla_len(nla) >= sizeof(struct in6_addr)) { 186 nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr)); 187 ip->sa.sa_family = AF_INET6; 188 return 0; 189 } else if (nla_len(nla) >= sizeof(__be32)) { 190 ip->sin.sin_addr.s_addr = nla_get_be32(nla); 191 ip->sa.sa_family = AF_INET; 192 return 0; 193 } else { 194 return -EAFNOSUPPORT; 195 } 196} 197 198static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 199 const union vxlan_addr *ip) 200{ 201 if (ip->sa.sa_family == AF_INET6) 202 return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr); 203 else 204 return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); 205} 206 207#else /* !CONFIG_IPV6 */ 208 209static inline 210bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 211{ 212 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 213} 214 215static inline bool vxlan_addr_any(const union vxlan_addr *ipa) 216{ 217 return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); 218} 219 220static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) 221{ 222 return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); 223} 224 225static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 226{ 227 if (nla_len(nla) >= sizeof(struct in6_addr)) { 228 return -EAFNOSUPPORT; 229 } else if (nla_len(nla) >= sizeof(__be32)) { 230 ip->sin.sin_addr.s_addr = nla_get_be32(nla); 231 ip->sa.sa_family = AF_INET; 232 return 0; 233 } else { 234 return -EAFNOSUPPORT; 235 } 236} 237 238static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 239 const union vxlan_addr *ip) 240{ 241 return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); 242} 243#endif 244 245/* Virtual Network hash table head */ 246static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) 247{ 248 return &vs->vni_list[hash_32(id, VNI_HASH_BITS)]; 249} 250 251/* Socket hash table head */ 252static inline struct hlist_head *vs_head(struct net *net, __be16 port) 253{ 254 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 255 256 return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; 257} 258 259/* First remote destination for a forwarding entry. 260 * Guaranteed to be non-NULL because remotes are never deleted. 261 */ 262static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) 263{ 264 return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); 265} 266 267static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) 268{ 269 return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); 270} 271 272/* Find VXLAN socket based on network namespace, address family and UDP port */ 273static struct vxlan_sock *vxlan_find_sock(struct net *net, 274 sa_family_t family, __be16 port) 275{ 276 struct vxlan_sock *vs; 277 278 hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { 279 if (inet_sk(vs->sock->sk)->inet_sport == port && 280 inet_sk(vs->sock->sk)->sk.sk_family == family) 281 return vs; 282 } 283 return NULL; 284} 285 286static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) 287{ 288 struct vxlan_dev *vxlan; 289 290 hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { 291 if (vxlan->default_dst.remote_vni == id) 292 return vxlan; 293 } 294 295 return NULL; 296} 297 298/* Look up VNI in a per net namespace table */ 299static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, 300 sa_family_t family, __be16 port) 301{ 302 struct vxlan_sock *vs; 303 304 vs = vxlan_find_sock(net, family, port); 305 if (!vs) 306 return NULL; 307 308 return vxlan_vs_find_vni(vs, id); 309} 310 311/* Fill in neighbour message in skbuff. */ 312static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, 313 const struct vxlan_fdb *fdb, 314 u32 portid, u32 seq, int type, unsigned int flags, 315 const struct vxlan_rdst *rdst) 316{ 317 unsigned long now = jiffies; 318 struct nda_cacheinfo ci; 319 struct nlmsghdr *nlh; 320 struct ndmsg *ndm; 321 bool send_ip, send_eth; 322 323 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); 324 if (nlh == NULL) 325 return -EMSGSIZE; 326 327 ndm = nlmsg_data(nlh); 328 memset(ndm, 0, sizeof(*ndm)); 329 330 send_eth = send_ip = true; 331 332 if (type == RTM_GETNEIGH) { 333 ndm->ndm_family = AF_INET; 334 send_ip = !vxlan_addr_any(&rdst->remote_ip); 335 send_eth = !is_zero_ether_addr(fdb->eth_addr); 336 } else 337 ndm->ndm_family = AF_BRIDGE; 338 ndm->ndm_state = fdb->state; 339 ndm->ndm_ifindex = vxlan->dev->ifindex; 340 ndm->ndm_flags = fdb->flags; 341 ndm->ndm_type = RTN_UNICAST; 342 343 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 344 goto nla_put_failure; 345 346 if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) 347 goto nla_put_failure; 348 349 if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && 350 nla_put_be16(skb, NDA_PORT, rdst->remote_port)) 351 goto nla_put_failure; 352 if (rdst->remote_vni != vxlan->default_dst.remote_vni && 353 nla_put_u32(skb, NDA_VNI, rdst->remote_vni)) 354 goto nla_put_failure; 355 if (rdst->remote_ifindex && 356 nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) 357 goto nla_put_failure; 358 359 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); 360 ci.ndm_confirmed = 0; 361 ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); 362 ci.ndm_refcnt = 0; 363 364 if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) 365 goto nla_put_failure; 366 367 return nlmsg_end(skb, nlh); 368 369nla_put_failure: 370 nlmsg_cancel(skb, nlh); 371 return -EMSGSIZE; 372} 373 374static inline size_t vxlan_nlmsg_size(void) 375{ 376 return NLMSG_ALIGN(sizeof(struct ndmsg)) 377 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ 378 + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ 379 + nla_total_size(sizeof(__be16)) /* NDA_PORT */ 380 + nla_total_size(sizeof(__be32)) /* NDA_VNI */ 381 + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ 382 + nla_total_size(sizeof(struct nda_cacheinfo)); 383} 384 385static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, 386 struct vxlan_rdst *rd, int type) 387{ 388 struct net *net = dev_net(vxlan->dev); 389 struct sk_buff *skb; 390 int err = -ENOBUFS; 391 392 skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); 393 if (skb == NULL) 394 goto errout; 395 396 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); 397 if (err < 0) { 398 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ 399 WARN_ON(err == -EMSGSIZE); 400 kfree_skb(skb); 401 goto errout; 402 } 403 404 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); 405 return; 406errout: 407 if (err < 0) 408 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 409} 410 411static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) 412{ 413 struct vxlan_dev *vxlan = netdev_priv(dev); 414 struct vxlan_fdb f = { 415 .state = NUD_STALE, 416 }; 417 struct vxlan_rdst remote = { 418 .remote_ip = *ipa, /* goes to NDA_DST */ 419 .remote_vni = VXLAN_N_VID, 420 }; 421 422 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH); 423} 424 425static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) 426{ 427 struct vxlan_fdb f = { 428 .state = NUD_STALE, 429 }; 430 struct vxlan_rdst remote = { }; 431 432 memcpy(f.eth_addr, eth_addr, ETH_ALEN); 433 434 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH); 435} 436 437/* Hash Ethernet address */ 438static u32 eth_hash(const unsigned char *addr) 439{ 440 u64 value = get_unaligned((u64 *)addr); 441 442 /* only want 6 bytes */ 443#ifdef __BIG_ENDIAN 444 value >>= 16; 445#else 446 value <<= 16; 447#endif 448 return hash_64(value, FDB_HASH_BITS); 449} 450 451/* Hash chain to use given mac address */ 452static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, 453 const u8 *mac) 454{ 455 return &vxlan->fdb_head[eth_hash(mac)]; 456} 457 458/* Look up Ethernet address in forwarding table */ 459static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, 460 const u8 *mac) 461{ 462 struct hlist_head *head = vxlan_fdb_head(vxlan, mac); 463 struct vxlan_fdb *f; 464 465 hlist_for_each_entry_rcu(f, head, hlist) { 466 if (ether_addr_equal(mac, f->eth_addr)) 467 return f; 468 } 469 470 return NULL; 471} 472 473static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, 474 const u8 *mac) 475{ 476 struct vxlan_fdb *f; 477 478 f = __vxlan_find_mac(vxlan, mac); 479 if (f) 480 f->used = jiffies; 481 482 return f; 483} 484 485/* caller should hold vxlan->hash_lock */ 486static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, 487 union vxlan_addr *ip, __be16 port, 488 __u32 vni, __u32 ifindex) 489{ 490 struct vxlan_rdst *rd; 491 492 list_for_each_entry(rd, &f->remotes, list) { 493 if (vxlan_addr_equal(&rd->remote_ip, ip) && 494 rd->remote_port == port && 495 rd->remote_vni == vni && 496 rd->remote_ifindex == ifindex) 497 return rd; 498 } 499 500 return NULL; 501} 502 503/* Replace destination of unicast mac */ 504static int vxlan_fdb_replace(struct vxlan_fdb *f, 505 union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) 506{ 507 struct vxlan_rdst *rd; 508 509 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 510 if (rd) 511 return 0; 512 513 rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); 514 if (!rd) 515 return 0; 516 rd->remote_ip = *ip; 517 rd->remote_port = port; 518 rd->remote_vni = vni; 519 rd->remote_ifindex = ifindex; 520 return 1; 521} 522 523/* Add/update destinations for multicast */ 524static int vxlan_fdb_append(struct vxlan_fdb *f, 525 union vxlan_addr *ip, __be16 port, __u32 vni, 526 __u32 ifindex, struct vxlan_rdst **rdp) 527{ 528 struct vxlan_rdst *rd; 529 530 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 531 if (rd) 532 return 0; 533 534 rd = kmalloc(sizeof(*rd), GFP_ATOMIC); 535 if (rd == NULL) 536 return -ENOBUFS; 537 rd->remote_ip = *ip; 538 rd->remote_port = port; 539 rd->remote_vni = vni; 540 rd->remote_ifindex = ifindex; 541 542 list_add_tail_rcu(&rd->list, &f->remotes); 543 544 *rdp = rd; 545 return 1; 546} 547 548static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) 549{ 550 struct sk_buff *p, **pp = NULL; 551 struct vxlanhdr *vh, *vh2; 552 struct ethhdr *eh, *eh2; 553 unsigned int hlen, off_vx, off_eth; 554 const struct packet_offload *ptype; 555 __be16 type; 556 int flush = 1; 557 558 off_vx = skb_gro_offset(skb); 559 hlen = off_vx + sizeof(*vh); 560 vh = skb_gro_header_fast(skb, off_vx); 561 if (skb_gro_header_hard(skb, hlen)) { 562 vh = skb_gro_header_slow(skb, hlen, off_vx); 563 if (unlikely(!vh)) 564 goto out; 565 } 566 skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ 567 skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); 568 569 off_eth = skb_gro_offset(skb); 570 hlen = off_eth + sizeof(*eh); 571 eh = skb_gro_header_fast(skb, off_eth); 572 if (skb_gro_header_hard(skb, hlen)) { 573 eh = skb_gro_header_slow(skb, hlen, off_eth); 574 if (unlikely(!eh)) 575 goto out; 576 } 577 578 flush = 0; 579 580 for (p = *head; p; p = p->next) { 581 if (!NAPI_GRO_CB(p)->same_flow) 582 continue; 583 584 vh2 = (struct vxlanhdr *)(p->data + off_vx); 585 eh2 = (struct ethhdr *)(p->data + off_eth); 586 if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { 587 NAPI_GRO_CB(p)->same_flow = 0; 588 continue; 589 } 590 } 591 592 type = eh->h_proto; 593 594 rcu_read_lock(); 595 ptype = gro_find_receive_by_type(type); 596 if (ptype == NULL) { 597 flush = 1; 598 goto out_unlock; 599 } 600 601 skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ 602 skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); 603 pp = ptype->callbacks.gro_receive(head, skb); 604 605out_unlock: 606 rcu_read_unlock(); 607out: 608 NAPI_GRO_CB(skb)->flush |= flush; 609 610 return pp; 611} 612 613static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) 614{ 615 struct ethhdr *eh; 616 struct packet_offload *ptype; 617 __be16 type; 618 int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); 619 int err = -ENOSYS; 620 621 udp_tunnel_gro_complete(skb, nhoff); 622 623 eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); 624 type = eh->h_proto; 625 626 rcu_read_lock(); 627 ptype = gro_find_complete_by_type(type); 628 if (ptype != NULL) 629 err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); 630 631 rcu_read_unlock(); 632 return err; 633} 634 635/* Notify netdevs that UDP port started listening */ 636static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) 637{ 638 struct net_device *dev; 639 struct sock *sk = vs->sock->sk; 640 struct net *net = sock_net(sk); 641 sa_family_t sa_family = sk->sk_family; 642 __be16 port = inet_sk(sk)->inet_sport; 643 int err; 644 645 if (sa_family == AF_INET) { 646 err = udp_add_offload(&vs->udp_offloads); 647 if (err) 648 pr_warn("vxlan: udp_add_offload failed with status %d\n", err); 649 } 650 651 rcu_read_lock(); 652 for_each_netdev_rcu(net, dev) { 653 if (dev->netdev_ops->ndo_add_vxlan_port) 654 dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, 655 port); 656 } 657 rcu_read_unlock(); 658} 659 660/* Notify netdevs that UDP port is no more listening */ 661static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) 662{ 663 struct net_device *dev; 664 struct sock *sk = vs->sock->sk; 665 struct net *net = sock_net(sk); 666 sa_family_t sa_family = sk->sk_family; 667 __be16 port = inet_sk(sk)->inet_sport; 668 669 rcu_read_lock(); 670 for_each_netdev_rcu(net, dev) { 671 if (dev->netdev_ops->ndo_del_vxlan_port) 672 dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, 673 port); 674 } 675 rcu_read_unlock(); 676 677 if (sa_family == AF_INET) 678 udp_del_offload(&vs->udp_offloads); 679} 680 681/* Add new entry to forwarding table -- assumes lock held */ 682static int vxlan_fdb_create(struct vxlan_dev *vxlan, 683 const u8 *mac, union vxlan_addr *ip, 684 __u16 state, __u16 flags, 685 __be16 port, __u32 vni, __u32 ifindex, 686 __u8 ndm_flags) 687{ 688 struct vxlan_rdst *rd = NULL; 689 struct vxlan_fdb *f; 690 int notify = 0; 691 692 f = __vxlan_find_mac(vxlan, mac); 693 if (f) { 694 if (flags & NLM_F_EXCL) { 695 netdev_dbg(vxlan->dev, 696 "lost race to create %pM\n", mac); 697 return -EEXIST; 698 } 699 if (f->state != state) { 700 f->state = state; 701 f->updated = jiffies; 702 notify = 1; 703 } 704 if (f->flags != ndm_flags) { 705 f->flags = ndm_flags; 706 f->updated = jiffies; 707 notify = 1; 708 } 709 if ((flags & NLM_F_REPLACE)) { 710 /* Only change unicasts */ 711 if (!(is_multicast_ether_addr(f->eth_addr) || 712 is_zero_ether_addr(f->eth_addr))) { 713 int rc = vxlan_fdb_replace(f, ip, port, vni, 714 ifindex); 715 716 if (rc < 0) 717 return rc; 718 notify |= rc; 719 } else 720 return -EOPNOTSUPP; 721 } 722 if ((flags & NLM_F_APPEND) && 723 (is_multicast_ether_addr(f->eth_addr) || 724 is_zero_ether_addr(f->eth_addr))) { 725 int rc = vxlan_fdb_append(f, ip, port, vni, ifindex, 726 &rd); 727 728 if (rc < 0) 729 return rc; 730 notify |= rc; 731 } 732 } else { 733 if (!(flags & NLM_F_CREATE)) 734 return -ENOENT; 735 736 if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) 737 return -ENOSPC; 738 739 /* Disallow replace to add a multicast entry */ 740 if ((flags & NLM_F_REPLACE) && 741 (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) 742 return -EOPNOTSUPP; 743 744 netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); 745 f = kmalloc(sizeof(*f), GFP_ATOMIC); 746 if (!f) 747 return -ENOMEM; 748 749 notify = 1; 750 f->state = state; 751 f->flags = ndm_flags; 752 f->updated = f->used = jiffies; 753 INIT_LIST_HEAD(&f->remotes); 754 memcpy(f->eth_addr, mac, ETH_ALEN); 755 756 vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); 757 758 ++vxlan->addrcnt; 759 hlist_add_head_rcu(&f->hlist, 760 vxlan_fdb_head(vxlan, mac)); 761 } 762 763 if (notify) { 764 if (rd == NULL) 765 rd = first_remote_rtnl(f); 766 vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH); 767 } 768 769 return 0; 770} 771 772static void vxlan_fdb_free(struct rcu_head *head) 773{ 774 struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); 775 struct vxlan_rdst *rd, *nd; 776 777 list_for_each_entry_safe(rd, nd, &f->remotes, list) 778 kfree(rd); 779 kfree(f); 780} 781 782static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) 783{ 784 netdev_dbg(vxlan->dev, 785 "delete %pM\n", f->eth_addr); 786 787 --vxlan->addrcnt; 788 vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); 789 790 hlist_del_rcu(&f->hlist); 791 call_rcu(&f->rcu, vxlan_fdb_free); 792} 793 794static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, 795 union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex) 796{ 797 struct net *net = dev_net(vxlan->dev); 798 int err; 799 800 if (tb[NDA_DST]) { 801 err = vxlan_nla_get_addr(ip, tb[NDA_DST]); 802 if (err) 803 return err; 804 } else { 805 union vxlan_addr *remote = &vxlan->default_dst.remote_ip; 806 if (remote->sa.sa_family == AF_INET) { 807 ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); 808 ip->sa.sa_family = AF_INET; 809#if IS_ENABLED(CONFIG_IPV6) 810 } else { 811 ip->sin6.sin6_addr = in6addr_any; 812 ip->sa.sa_family = AF_INET6; 813#endif 814 } 815 } 816 817 if (tb[NDA_PORT]) { 818 if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) 819 return -EINVAL; 820 *port = nla_get_be16(tb[NDA_PORT]); 821 } else { 822 *port = vxlan->dst_port; 823 } 824 825 if (tb[NDA_VNI]) { 826 if (nla_len(tb[NDA_VNI]) != sizeof(u32)) 827 return -EINVAL; 828 *vni = nla_get_u32(tb[NDA_VNI]); 829 } else { 830 *vni = vxlan->default_dst.remote_vni; 831 } 832 833 if (tb[NDA_IFINDEX]) { 834 struct net_device *tdev; 835 836 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) 837 return -EINVAL; 838 *ifindex = nla_get_u32(tb[NDA_IFINDEX]); 839 tdev = __dev_get_by_index(net, *ifindex); 840 if (!tdev) 841 return -EADDRNOTAVAIL; 842 } else { 843 *ifindex = 0; 844 } 845 846 return 0; 847} 848 849/* Add static entry (via netlink) */ 850static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], 851 struct net_device *dev, 852 const unsigned char *addr, u16 flags) 853{ 854 struct vxlan_dev *vxlan = netdev_priv(dev); 855 /* struct net *net = dev_net(vxlan->dev); */ 856 union vxlan_addr ip; 857 __be16 port; 858 u32 vni, ifindex; 859 int err; 860 861 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { 862 pr_info("RTM_NEWNEIGH with invalid state %#x\n", 863 ndm->ndm_state); 864 return -EINVAL; 865 } 866 867 if (tb[NDA_DST] == NULL) 868 return -EINVAL; 869 870 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); 871 if (err) 872 return err; 873 874 if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) 875 return -EAFNOSUPPORT; 876 877 spin_lock_bh(&vxlan->hash_lock); 878 err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, 879 port, vni, ifindex, ndm->ndm_flags); 880 spin_unlock_bh(&vxlan->hash_lock); 881 882 return err; 883} 884 885/* Delete entry (via netlink) */ 886static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], 887 struct net_device *dev, 888 const unsigned char *addr) 889{ 890 struct vxlan_dev *vxlan = netdev_priv(dev); 891 struct vxlan_fdb *f; 892 struct vxlan_rdst *rd = NULL; 893 union vxlan_addr ip; 894 __be16 port; 895 u32 vni, ifindex; 896 int err; 897 898 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); 899 if (err) 900 return err; 901 902 err = -ENOENT; 903 904 spin_lock_bh(&vxlan->hash_lock); 905 f = vxlan_find_mac(vxlan, addr); 906 if (!f) 907 goto out; 908 909 if (!vxlan_addr_any(&ip)) { 910 rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); 911 if (!rd) 912 goto out; 913 } 914 915 err = 0; 916 917 /* remove a destination if it's not the only one on the list, 918 * otherwise destroy the fdb entry 919 */ 920 if (rd && !list_is_singular(&f->remotes)) { 921 list_del_rcu(&rd->list); 922 vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH); 923 kfree_rcu(rd, rcu); 924 goto out; 925 } 926 927 vxlan_fdb_destroy(vxlan, f); 928 929out: 930 spin_unlock_bh(&vxlan->hash_lock); 931 932 return err; 933} 934 935/* Dump forwarding table */ 936static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 937 struct net_device *dev, 938 struct net_device *filter_dev, int idx) 939{ 940 struct vxlan_dev *vxlan = netdev_priv(dev); 941 unsigned int h; 942 943 for (h = 0; h < FDB_HASH_SIZE; ++h) { 944 struct vxlan_fdb *f; 945 int err; 946 947 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { 948 struct vxlan_rdst *rd; 949 950 if (idx < cb->args[0]) 951 goto skip; 952 953 list_for_each_entry_rcu(rd, &f->remotes, list) { 954 err = vxlan_fdb_info(skb, vxlan, f, 955 NETLINK_CB(cb->skb).portid, 956 cb->nlh->nlmsg_seq, 957 RTM_NEWNEIGH, 958 NLM_F_MULTI, rd); 959 if (err < 0) 960 goto out; 961 } 962skip: 963 ++idx; 964 } 965 } 966out: 967 return idx; 968} 969 970/* Watch incoming packets to learn mapping between Ethernet address 971 * and Tunnel endpoint. 972 * Return true if packet is bogus and should be droppped. 973 */ 974static bool vxlan_snoop(struct net_device *dev, 975 union vxlan_addr *src_ip, const u8 *src_mac) 976{ 977 struct vxlan_dev *vxlan = netdev_priv(dev); 978 struct vxlan_fdb *f; 979 980 f = vxlan_find_mac(vxlan, src_mac); 981 if (likely(f)) { 982 struct vxlan_rdst *rdst = first_remote_rcu(f); 983 984 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) 985 return false; 986 987 /* Don't migrate static entries, drop packets */ 988 if (f->state & NUD_NOARP) 989 return true; 990 991 if (net_ratelimit()) 992 netdev_info(dev, 993 "%pM migrated from %pIS to %pIS\n", 994 src_mac, &rdst->remote_ip, &src_ip); 995 996 rdst->remote_ip = *src_ip; 997 f->updated = jiffies; 998 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH); 999 } else { 1000 /* learned new entry */ 1001 spin_lock(&vxlan->hash_lock); 1002 1003 /* close off race between vxlan_flush and incoming packets */ 1004 if (netif_running(dev)) 1005 vxlan_fdb_create(vxlan, src_mac, src_ip, 1006 NUD_REACHABLE, 1007 NLM_F_EXCL|NLM_F_CREATE, 1008 vxlan->dst_port, 1009 vxlan->default_dst.remote_vni, 1010 0, NTF_SELF); 1011 spin_unlock(&vxlan->hash_lock); 1012 } 1013 1014 return false; 1015} 1016 1017/* See if multicast group is already in use by other ID */ 1018static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) 1019{ 1020 struct vxlan_dev *vxlan; 1021 1022 /* The vxlan_sock is only used by dev, leaving group has 1023 * no effect on other vxlan devices. 1024 */ 1025 if (atomic_read(&dev->vn_sock->refcnt) == 1) 1026 return false; 1027 1028 list_for_each_entry(vxlan, &vn->vxlan_list, next) { 1029 if (!netif_running(vxlan->dev) || vxlan == dev) 1030 continue; 1031 1032 if (vxlan->vn_sock != dev->vn_sock) 1033 continue; 1034 1035 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, 1036 &dev->default_dst.remote_ip)) 1037 continue; 1038 1039 if (vxlan->default_dst.remote_ifindex != 1040 dev->default_dst.remote_ifindex) 1041 continue; 1042 1043 return true; 1044 } 1045 1046 return false; 1047} 1048 1049static void vxlan_sock_hold(struct vxlan_sock *vs) 1050{ 1051 atomic_inc(&vs->refcnt); 1052} 1053 1054void vxlan_sock_release(struct vxlan_sock *vs) 1055{ 1056 struct sock *sk = vs->sock->sk; 1057 struct net *net = sock_net(sk); 1058 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1059 1060 if (!atomic_dec_and_test(&vs->refcnt)) 1061 return; 1062 1063 spin_lock(&vn->sock_lock); 1064 hlist_del_rcu(&vs->hlist); 1065 vxlan_notify_del_rx_port(vs); 1066 spin_unlock(&vn->sock_lock); 1067 1068 queue_work(vxlan_wq, &vs->del_work); 1069} 1070EXPORT_SYMBOL_GPL(vxlan_sock_release); 1071 1072/* Callback to update multicast group membership when first VNI on 1073 * multicast asddress is brought up 1074 * Done as workqueue because ip_mc_join_group acquires RTNL. 1075 */ 1076static void vxlan_igmp_join(struct work_struct *work) 1077{ 1078 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); 1079 struct vxlan_sock *vs = vxlan->vn_sock; 1080 struct sock *sk = vs->sock->sk; 1081 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1082 int ifindex = vxlan->default_dst.remote_ifindex; 1083 1084 lock_sock(sk); 1085 if (ip->sa.sa_family == AF_INET) { 1086 struct ip_mreqn mreq = { 1087 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1088 .imr_ifindex = ifindex, 1089 }; 1090 1091 ip_mc_join_group(sk, &mreq); 1092#if IS_ENABLED(CONFIG_IPV6) 1093 } else { 1094 ipv6_stub->ipv6_sock_mc_join(sk, ifindex, 1095 &ip->sin6.sin6_addr); 1096#endif 1097 } 1098 release_sock(sk); 1099 1100 vxlan_sock_release(vs); 1101 dev_put(vxlan->dev); 1102} 1103 1104/* Inverse of vxlan_igmp_join when last VNI is brought down */ 1105static void vxlan_igmp_leave(struct work_struct *work) 1106{ 1107 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); 1108 struct vxlan_sock *vs = vxlan->vn_sock; 1109 struct sock *sk = vs->sock->sk; 1110 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1111 int ifindex = vxlan->default_dst.remote_ifindex; 1112 1113 lock_sock(sk); 1114 if (ip->sa.sa_family == AF_INET) { 1115 struct ip_mreqn mreq = { 1116 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1117 .imr_ifindex = ifindex, 1118 }; 1119 1120 ip_mc_leave_group(sk, &mreq); 1121#if IS_ENABLED(CONFIG_IPV6) 1122 } else { 1123 ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, 1124 &ip->sin6.sin6_addr); 1125#endif 1126 } 1127 1128 release_sock(sk); 1129 1130 vxlan_sock_release(vs); 1131 dev_put(vxlan->dev); 1132} 1133 1134/* Callback from net/ipv4/udp.c to receive packets */ 1135static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) 1136{ 1137 struct vxlan_sock *vs; 1138 struct vxlanhdr *vxh; 1139 1140 /* Need Vxlan and inner Ethernet header to be present */ 1141 if (!pskb_may_pull(skb, VXLAN_HLEN)) 1142 goto error; 1143 1144 /* Return packets with reserved bits set */ 1145 vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); 1146 if (vxh->vx_flags != htonl(VXLAN_FLAGS) || 1147 (vxh->vx_vni & htonl(0xff))) { 1148 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", 1149 ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); 1150 goto error; 1151 } 1152 1153 if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) 1154 goto drop; 1155 1156 vs = rcu_dereference_sk_user_data(sk); 1157 if (!vs) 1158 goto drop; 1159 1160 vs->rcv(vs, skb, vxh->vx_vni); 1161 return 0; 1162 1163drop: 1164 /* Consume bad packet */ 1165 kfree_skb(skb); 1166 return 0; 1167 1168error: 1169 /* Return non vxlan pkt */ 1170 return 1; 1171} 1172 1173static void vxlan_rcv(struct vxlan_sock *vs, 1174 struct sk_buff *skb, __be32 vx_vni) 1175{ 1176 struct iphdr *oip = NULL; 1177 struct ipv6hdr *oip6 = NULL; 1178 struct vxlan_dev *vxlan; 1179 struct pcpu_sw_netstats *stats; 1180 union vxlan_addr saddr; 1181 __u32 vni; 1182 int err = 0; 1183 union vxlan_addr *remote_ip; 1184 1185 vni = ntohl(vx_vni) >> 8; 1186 /* Is this VNI defined? */ 1187 vxlan = vxlan_vs_find_vni(vs, vni); 1188 if (!vxlan) 1189 goto drop; 1190 1191 remote_ip = &vxlan->default_dst.remote_ip; 1192 skb_reset_mac_header(skb); 1193 skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); 1194 skb->protocol = eth_type_trans(skb, vxlan->dev); 1195 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1196 1197 /* Ignore packet loops (and multicast echo) */ 1198 if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) 1199 goto drop; 1200 1201 /* Re-examine inner Ethernet packet */ 1202 if (remote_ip->sa.sa_family == AF_INET) { 1203 oip = ip_hdr(skb); 1204 saddr.sin.sin_addr.s_addr = oip->saddr; 1205 saddr.sa.sa_family = AF_INET; 1206#if IS_ENABLED(CONFIG_IPV6) 1207 } else { 1208 oip6 = ipv6_hdr(skb); 1209 saddr.sin6.sin6_addr = oip6->saddr; 1210 saddr.sa.sa_family = AF_INET6; 1211#endif 1212 } 1213 1214 if ((vxlan->flags & VXLAN_F_LEARN) && 1215 vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) 1216 goto drop; 1217 1218 skb_reset_network_header(skb); 1219 1220 if (oip6) 1221 err = IP6_ECN_decapsulate(oip6, skb); 1222 if (oip) 1223 err = IP_ECN_decapsulate(oip, skb); 1224 1225 if (unlikely(err)) { 1226 if (log_ecn_error) { 1227 if (oip6) 1228 net_info_ratelimited("non-ECT from %pI6\n", 1229 &oip6->saddr); 1230 if (oip) 1231 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 1232 &oip->saddr, oip->tos); 1233 } 1234 if (err > 1) { 1235 ++vxlan->dev->stats.rx_frame_errors; 1236 ++vxlan->dev->stats.rx_errors; 1237 goto drop; 1238 } 1239 } 1240 1241 stats = this_cpu_ptr(vxlan->dev->tstats); 1242 u64_stats_update_begin(&stats->syncp); 1243 stats->rx_packets++; 1244 stats->rx_bytes += skb->len; 1245 u64_stats_update_end(&stats->syncp); 1246 1247 netif_rx(skb); 1248 1249 return; 1250drop: 1251 /* Consume bad packet */ 1252 kfree_skb(skb); 1253} 1254 1255static int arp_reduce(struct net_device *dev, struct sk_buff *skb) 1256{ 1257 struct vxlan_dev *vxlan = netdev_priv(dev); 1258 struct arphdr *parp; 1259 u8 *arpptr, *sha; 1260 __be32 sip, tip; 1261 struct neighbour *n; 1262 1263 if (dev->flags & IFF_NOARP) 1264 goto out; 1265 1266 if (!pskb_may_pull(skb, arp_hdr_len(dev))) { 1267 dev->stats.tx_dropped++; 1268 goto out; 1269 } 1270 parp = arp_hdr(skb); 1271 1272 if ((parp->ar_hrd != htons(ARPHRD_ETHER) && 1273 parp->ar_hrd != htons(ARPHRD_IEEE802)) || 1274 parp->ar_pro != htons(ETH_P_IP) || 1275 parp->ar_op != htons(ARPOP_REQUEST) || 1276 parp->ar_hln != dev->addr_len || 1277 parp->ar_pln != 4) 1278 goto out; 1279 arpptr = (u8 *)parp + sizeof(struct arphdr); 1280 sha = arpptr; 1281 arpptr += dev->addr_len; /* sha */ 1282 memcpy(&sip, arpptr, sizeof(sip)); 1283 arpptr += sizeof(sip); 1284 arpptr += dev->addr_len; /* tha */ 1285 memcpy(&tip, arpptr, sizeof(tip)); 1286 1287 if (ipv4_is_loopback(tip) || 1288 ipv4_is_multicast(tip)) 1289 goto out; 1290 1291 n = neigh_lookup(&arp_tbl, &tip, dev); 1292 1293 if (n) { 1294 struct vxlan_fdb *f; 1295 struct sk_buff *reply; 1296 1297 if (!(n->nud_state & NUD_CONNECTED)) { 1298 neigh_release(n); 1299 goto out; 1300 } 1301 1302 f = vxlan_find_mac(vxlan, n->ha); 1303 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1304 /* bridge-local neighbor */ 1305 neigh_release(n); 1306 goto out; 1307 } 1308 1309 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, 1310 n->ha, sha); 1311 1312 neigh_release(n); 1313 1314 if (reply == NULL) 1315 goto out; 1316 1317 skb_reset_mac_header(reply); 1318 __skb_pull(reply, skb_network_offset(reply)); 1319 reply->ip_summed = CHECKSUM_UNNECESSARY; 1320 reply->pkt_type = PACKET_HOST; 1321 1322 if (netif_rx_ni(reply) == NET_RX_DROP) 1323 dev->stats.rx_dropped++; 1324 } else if (vxlan->flags & VXLAN_F_L3MISS) { 1325 union vxlan_addr ipa = { 1326 .sin.sin_addr.s_addr = tip, 1327 .sin.sin_family = AF_INET, 1328 }; 1329 1330 vxlan_ip_miss(dev, &ipa); 1331 } 1332out: 1333 consume_skb(skb); 1334 return NETDEV_TX_OK; 1335} 1336 1337#if IS_ENABLED(CONFIG_IPV6) 1338static struct sk_buff *vxlan_na_create(struct sk_buff *request, 1339 struct neighbour *n, bool isrouter) 1340{ 1341 struct net_device *dev = request->dev; 1342 struct sk_buff *reply; 1343 struct nd_msg *ns, *na; 1344 struct ipv6hdr *pip6; 1345 u8 *daddr; 1346 int na_olen = 8; /* opt hdr + ETH_ALEN for target */ 1347 int ns_olen; 1348 int i, len; 1349 1350 if (dev == NULL) 1351 return NULL; 1352 1353 len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + 1354 sizeof(*na) + na_olen + dev->needed_tailroom; 1355 reply = alloc_skb(len, GFP_ATOMIC); 1356 if (reply == NULL) 1357 return NULL; 1358 1359 reply->protocol = htons(ETH_P_IPV6); 1360 reply->dev = dev; 1361 skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); 1362 skb_push(reply, sizeof(struct ethhdr)); 1363 skb_set_mac_header(reply, 0); 1364 1365 ns = (struct nd_msg *)skb_transport_header(request); 1366 1367 daddr = eth_hdr(request)->h_source; 1368 ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns); 1369 for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { 1370 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { 1371 daddr = ns->opt + i + sizeof(struct nd_opt_hdr); 1372 break; 1373 } 1374 } 1375 1376 /* Ethernet header */ 1377 ether_addr_copy(eth_hdr(reply)->h_dest, daddr); 1378 ether_addr_copy(eth_hdr(reply)->h_source, n->ha); 1379 eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); 1380 reply->protocol = htons(ETH_P_IPV6); 1381 1382 skb_pull(reply, sizeof(struct ethhdr)); 1383 skb_set_network_header(reply, 0); 1384 skb_put(reply, sizeof(struct ipv6hdr)); 1385 1386 /* IPv6 header */ 1387 1388 pip6 = ipv6_hdr(reply); 1389 memset(pip6, 0, sizeof(struct ipv6hdr)); 1390 pip6->version = 6; 1391 pip6->priority = ipv6_hdr(request)->priority; 1392 pip6->nexthdr = IPPROTO_ICMPV6; 1393 pip6->hop_limit = 255; 1394 pip6->daddr = ipv6_hdr(request)->saddr; 1395 pip6->saddr = *(struct in6_addr *)n->primary_key; 1396 1397 skb_pull(reply, sizeof(struct ipv6hdr)); 1398 skb_set_transport_header(reply, 0); 1399 1400 na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); 1401 1402 /* Neighbor Advertisement */ 1403 memset(na, 0, sizeof(*na)+na_olen); 1404 na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; 1405 na->icmph.icmp6_router = isrouter; 1406 na->icmph.icmp6_override = 1; 1407 na->icmph.icmp6_solicited = 1; 1408 na->target = ns->target; 1409 ether_addr_copy(&na->opt[2], n->ha); 1410 na->opt[0] = ND_OPT_TARGET_LL_ADDR; 1411 na->opt[1] = na_olen >> 3; 1412 1413 na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, 1414 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, 1415 csum_partial(na, sizeof(*na)+na_olen, 0)); 1416 1417 pip6->payload_len = htons(sizeof(*na)+na_olen); 1418 1419 skb_push(reply, sizeof(struct ipv6hdr)); 1420 1421 reply->ip_summed = CHECKSUM_UNNECESSARY; 1422 1423 return reply; 1424} 1425 1426static int neigh_reduce(struct net_device *dev, struct sk_buff *skb) 1427{ 1428 struct vxlan_dev *vxlan = netdev_priv(dev); 1429 struct nd_msg *msg; 1430 const struct ipv6hdr *iphdr; 1431 const struct in6_addr *saddr, *daddr; 1432 struct neighbour *n; 1433 struct inet6_dev *in6_dev; 1434 1435 in6_dev = __in6_dev_get(dev); 1436 if (!in6_dev) 1437 goto out; 1438 1439 iphdr = ipv6_hdr(skb); 1440 saddr = &iphdr->saddr; 1441 daddr = &iphdr->daddr; 1442 1443 msg = (struct nd_msg *)skb_transport_header(skb); 1444 if (msg->icmph.icmp6_code != 0 || 1445 msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) 1446 goto out; 1447 1448 if (ipv6_addr_loopback(daddr) || 1449 ipv6_addr_is_multicast(&msg->target)) 1450 goto out; 1451 1452 n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); 1453 1454 if (n) { 1455 struct vxlan_fdb *f; 1456 struct sk_buff *reply; 1457 1458 if (!(n->nud_state & NUD_CONNECTED)) { 1459 neigh_release(n); 1460 goto out; 1461 } 1462 1463 f = vxlan_find_mac(vxlan, n->ha); 1464 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1465 /* bridge-local neighbor */ 1466 neigh_release(n); 1467 goto out; 1468 } 1469 1470 reply = vxlan_na_create(skb, n, 1471 !!(f ? f->flags & NTF_ROUTER : 0)); 1472 1473 neigh_release(n); 1474 1475 if (reply == NULL) 1476 goto out; 1477 1478 if (netif_rx_ni(reply) == NET_RX_DROP) 1479 dev->stats.rx_dropped++; 1480 1481 } else if (vxlan->flags & VXLAN_F_L3MISS) { 1482 union vxlan_addr ipa = { 1483 .sin6.sin6_addr = msg->target, 1484 .sin6.sin6_family = AF_INET6, 1485 }; 1486 1487 vxlan_ip_miss(dev, &ipa); 1488 } 1489 1490out: 1491 consume_skb(skb); 1492 return NETDEV_TX_OK; 1493} 1494#endif 1495 1496static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) 1497{ 1498 struct vxlan_dev *vxlan = netdev_priv(dev); 1499 struct neighbour *n; 1500 1501 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) 1502 return false; 1503 1504 n = NULL; 1505 switch (ntohs(eth_hdr(skb)->h_proto)) { 1506 case ETH_P_IP: 1507 { 1508 struct iphdr *pip; 1509 1510 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 1511 return false; 1512 pip = ip_hdr(skb); 1513 n = neigh_lookup(&arp_tbl, &pip->daddr, dev); 1514 if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { 1515 union vxlan_addr ipa = { 1516 .sin.sin_addr.s_addr = pip->daddr, 1517 .sin.sin_family = AF_INET, 1518 }; 1519 1520 vxlan_ip_miss(dev, &ipa); 1521 return false; 1522 } 1523 1524 break; 1525 } 1526#if IS_ENABLED(CONFIG_IPV6) 1527 case ETH_P_IPV6: 1528 { 1529 struct ipv6hdr *pip6; 1530 1531 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 1532 return false; 1533 pip6 = ipv6_hdr(skb); 1534 n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); 1535 if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { 1536 union vxlan_addr ipa = { 1537 .sin6.sin6_addr = pip6->daddr, 1538 .sin6.sin6_family = AF_INET6, 1539 }; 1540 1541 vxlan_ip_miss(dev, &ipa); 1542 return false; 1543 } 1544 1545 break; 1546 } 1547#endif 1548 default: 1549 return false; 1550 } 1551 1552 if (n) { 1553 bool diff; 1554 1555 diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); 1556 if (diff) { 1557 memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 1558 dev->addr_len); 1559 memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); 1560 } 1561 neigh_release(n); 1562 return diff; 1563 } 1564 1565 return false; 1566} 1567 1568#if IS_ENABLED(CONFIG_IPV6) 1569static int vxlan6_xmit_skb(struct vxlan_sock *vs, 1570 struct dst_entry *dst, struct sk_buff *skb, 1571 struct net_device *dev, struct in6_addr *saddr, 1572 struct in6_addr *daddr, __u8 prio, __u8 ttl, 1573 __be16 src_port, __be16 dst_port, __be32 vni, 1574 bool xnet) 1575{ 1576 struct vxlanhdr *vxh; 1577 int min_headroom; 1578 int err; 1579 bool udp_sum = !udp_get_no_check6_tx(vs->sock->sk); 1580 1581 skb = udp_tunnel_handle_offloads(skb, udp_sum); 1582 if (IS_ERR(skb)) 1583 return -EINVAL; 1584 1585 skb_scrub_packet(skb, xnet); 1586 1587 min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len 1588 + VXLAN_HLEN + sizeof(struct ipv6hdr) 1589 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); 1590 1591 /* Need space for new headers (invalidates iph ptr) */ 1592 err = skb_cow_head(skb, min_headroom); 1593 if (unlikely(err)) 1594 return err; 1595 1596 if (vlan_tx_tag_present(skb)) { 1597 if (WARN_ON(!__vlan_put_tag(skb, 1598 skb->vlan_proto, 1599 vlan_tx_tag_get(skb)))) 1600 return -ENOMEM; 1601 1602 skb->vlan_tci = 0; 1603 } 1604 1605 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 1606 vxh->vx_flags = htonl(VXLAN_FLAGS); 1607 vxh->vx_vni = vni; 1608 1609 skb_set_inner_protocol(skb, htons(ETH_P_TEB)); 1610 1611 udp_tunnel6_xmit_skb(vs->sock, dst, skb, dev, saddr, daddr, prio, 1612 ttl, src_port, dst_port); 1613 return 0; 1614} 1615#endif 1616 1617int vxlan_xmit_skb(struct vxlan_sock *vs, 1618 struct rtable *rt, struct sk_buff *skb, 1619 __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, 1620 __be16 src_port, __be16 dst_port, __be32 vni, bool xnet) 1621{ 1622 struct vxlanhdr *vxh; 1623 int min_headroom; 1624 int err; 1625 bool udp_sum = !vs->sock->sk->sk_no_check_tx; 1626 1627 skb = udp_tunnel_handle_offloads(skb, udp_sum); 1628 if (IS_ERR(skb)) 1629 return -EINVAL; 1630 1631 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len 1632 + VXLAN_HLEN + sizeof(struct iphdr) 1633 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); 1634 1635 /* Need space for new headers (invalidates iph ptr) */ 1636 err = skb_cow_head(skb, min_headroom); 1637 if (unlikely(err)) 1638 return err; 1639 1640 if (vlan_tx_tag_present(skb)) { 1641 if (WARN_ON(!__vlan_put_tag(skb, 1642 skb->vlan_proto, 1643 vlan_tx_tag_get(skb)))) 1644 return -ENOMEM; 1645 1646 skb->vlan_tci = 0; 1647 } 1648 1649 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 1650 vxh->vx_flags = htonl(VXLAN_FLAGS); 1651 vxh->vx_vni = vni; 1652 1653 skb_set_inner_protocol(skb, htons(ETH_P_TEB)); 1654 1655 return udp_tunnel_xmit_skb(vs->sock, rt, skb, src, dst, tos, 1656 ttl, df, src_port, dst_port, xnet); 1657} 1658EXPORT_SYMBOL_GPL(vxlan_xmit_skb); 1659 1660/* Bypass encapsulation if the destination is local */ 1661static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, 1662 struct vxlan_dev *dst_vxlan) 1663{ 1664 struct pcpu_sw_netstats *tx_stats, *rx_stats; 1665 union vxlan_addr loopback; 1666 union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; 1667 struct net_device *dev = skb->dev; 1668 int len = skb->len; 1669 1670 tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); 1671 rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); 1672 skb->pkt_type = PACKET_HOST; 1673 skb->encapsulation = 0; 1674 skb->dev = dst_vxlan->dev; 1675 __skb_pull(skb, skb_network_offset(skb)); 1676 1677 if (remote_ip->sa.sa_family == AF_INET) { 1678 loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 1679 loopback.sa.sa_family = AF_INET; 1680#if IS_ENABLED(CONFIG_IPV6) 1681 } else { 1682 loopback.sin6.sin6_addr = in6addr_loopback; 1683 loopback.sa.sa_family = AF_INET6; 1684#endif 1685 } 1686 1687 if (dst_vxlan->flags & VXLAN_F_LEARN) 1688 vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); 1689 1690 u64_stats_update_begin(&tx_stats->syncp); 1691 tx_stats->tx_packets++; 1692 tx_stats->tx_bytes += len; 1693 u64_stats_update_end(&tx_stats->syncp); 1694 1695 if (netif_rx(skb) == NET_RX_SUCCESS) { 1696 u64_stats_update_begin(&rx_stats->syncp); 1697 rx_stats->rx_packets++; 1698 rx_stats->rx_bytes += len; 1699 u64_stats_update_end(&rx_stats->syncp); 1700 } else { 1701 dev->stats.rx_dropped++; 1702 } 1703} 1704 1705static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, 1706 struct vxlan_rdst *rdst, bool did_rsc) 1707{ 1708 struct vxlan_dev *vxlan = netdev_priv(dev); 1709 struct rtable *rt = NULL; 1710 const struct iphdr *old_iph; 1711 struct flowi4 fl4; 1712 union vxlan_addr *dst; 1713 __be16 src_port = 0, dst_port; 1714 u32 vni; 1715 __be16 df = 0; 1716 __u8 tos, ttl; 1717 int err; 1718 1719 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; 1720 vni = rdst->remote_vni; 1721 dst = &rdst->remote_ip; 1722 1723 if (vxlan_addr_any(dst)) { 1724 if (did_rsc) { 1725 /* short-circuited back to local bridge */ 1726 vxlan_encap_bypass(skb, vxlan, vxlan); 1727 return; 1728 } 1729 goto drop; 1730 } 1731 1732 old_iph = ip_hdr(skb); 1733 1734 ttl = vxlan->ttl; 1735 if (!ttl && vxlan_addr_multicast(dst)) 1736 ttl = 1; 1737 1738 tos = vxlan->tos; 1739 if (tos == 1) 1740 tos = ip_tunnel_get_dsfield(old_iph, skb); 1741 1742 src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min, 1743 vxlan->port_max, true); 1744 1745 if (dst->sa.sa_family == AF_INET) { 1746 memset(&fl4, 0, sizeof(fl4)); 1747 fl4.flowi4_oif = rdst->remote_ifindex; 1748 fl4.flowi4_tos = RT_TOS(tos); 1749 fl4.daddr = dst->sin.sin_addr.s_addr; 1750 fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; 1751 1752 rt = ip_route_output_key(vxlan->net, &fl4); 1753 if (IS_ERR(rt)) { 1754 netdev_dbg(dev, "no route to %pI4\n", 1755 &dst->sin.sin_addr.s_addr); 1756 dev->stats.tx_carrier_errors++; 1757 goto tx_error; 1758 } 1759 1760 if (rt->dst.dev == dev) { 1761 netdev_dbg(dev, "circular route to %pI4\n", 1762 &dst->sin.sin_addr.s_addr); 1763 dev->stats.collisions++; 1764 goto rt_tx_error; 1765 } 1766 1767 /* Bypass encapsulation if the destination is local */ 1768 if (rt->rt_flags & RTCF_LOCAL && 1769 !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { 1770 struct vxlan_dev *dst_vxlan; 1771 1772 ip_rt_put(rt); 1773 dst_vxlan = vxlan_find_vni(vxlan->net, vni, 1774 dst->sa.sa_family, dst_port); 1775 if (!dst_vxlan) 1776 goto tx_error; 1777 vxlan_encap_bypass(skb, vxlan, dst_vxlan); 1778 return; 1779 } 1780 1781 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 1782 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 1783 1784 err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb, 1785 fl4.saddr, dst->sin.sin_addr.s_addr, 1786 tos, ttl, df, src_port, dst_port, 1787 htonl(vni << 8), 1788 !net_eq(vxlan->net, dev_net(vxlan->dev))); 1789 1790 if (err < 0) 1791 goto rt_tx_error; 1792 iptunnel_xmit_stats(err, &dev->stats, dev->tstats); 1793#if IS_ENABLED(CONFIG_IPV6) 1794 } else { 1795 struct sock *sk = vxlan->vn_sock->sock->sk; 1796 struct dst_entry *ndst; 1797 struct flowi6 fl6; 1798 u32 flags; 1799 1800 memset(&fl6, 0, sizeof(fl6)); 1801 fl6.flowi6_oif = rdst->remote_ifindex; 1802 fl6.daddr = dst->sin6.sin6_addr; 1803 fl6.saddr = vxlan->saddr.sin6.sin6_addr; 1804 fl6.flowi6_proto = IPPROTO_UDP; 1805 1806 if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { 1807 netdev_dbg(dev, "no route to %pI6\n", 1808 &dst->sin6.sin6_addr); 1809 dev->stats.tx_carrier_errors++; 1810 goto tx_error; 1811 } 1812 1813 if (ndst->dev == dev) { 1814 netdev_dbg(dev, "circular route to %pI6\n", 1815 &dst->sin6.sin6_addr); 1816 dst_release(ndst); 1817 dev->stats.collisions++; 1818 goto tx_error; 1819 } 1820 1821 /* Bypass encapsulation if the destination is local */ 1822 flags = ((struct rt6_info *)ndst)->rt6i_flags; 1823 if (flags & RTF_LOCAL && 1824 !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { 1825 struct vxlan_dev *dst_vxlan; 1826 1827 dst_release(ndst); 1828 dst_vxlan = vxlan_find_vni(vxlan->net, vni, 1829 dst->sa.sa_family, dst_port); 1830 if (!dst_vxlan) 1831 goto tx_error; 1832 vxlan_encap_bypass(skb, vxlan, dst_vxlan); 1833 return; 1834 } 1835 1836 ttl = ttl ? : ip6_dst_hoplimit(ndst); 1837 1838 err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb, 1839 dev, &fl6.saddr, &fl6.daddr, 0, ttl, 1840 src_port, dst_port, htonl(vni << 8), 1841 !net_eq(vxlan->net, dev_net(vxlan->dev))); 1842#endif 1843 } 1844 1845 return; 1846 1847drop: 1848 dev->stats.tx_dropped++; 1849 goto tx_free; 1850 1851rt_tx_error: 1852 ip_rt_put(rt); 1853tx_error: 1854 dev->stats.tx_errors++; 1855tx_free: 1856 dev_kfree_skb(skb); 1857} 1858 1859/* Transmit local packets over Vxlan 1860 * 1861 * Outer IP header inherits ECN and DF from inner header. 1862 * Outer UDP destination is the VXLAN assigned port. 1863 * source port is based on hash of flow 1864 */ 1865static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 1866{ 1867 struct vxlan_dev *vxlan = netdev_priv(dev); 1868 struct ethhdr *eth; 1869 bool did_rsc = false; 1870 struct vxlan_rdst *rdst, *fdst = NULL; 1871 struct vxlan_fdb *f; 1872 1873 skb_reset_mac_header(skb); 1874 eth = eth_hdr(skb); 1875 1876 if ((vxlan->flags & VXLAN_F_PROXY)) { 1877 if (ntohs(eth->h_proto) == ETH_P_ARP) 1878 return arp_reduce(dev, skb); 1879#if IS_ENABLED(CONFIG_IPV6) 1880 else if (ntohs(eth->h_proto) == ETH_P_IPV6 && 1881 pskb_may_pull(skb, sizeof(struct ipv6hdr) 1882 + sizeof(struct nd_msg)) && 1883 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 1884 struct nd_msg *msg; 1885 1886 msg = (struct nd_msg *)skb_transport_header(skb); 1887 if (msg->icmph.icmp6_code == 0 && 1888 msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) 1889 return neigh_reduce(dev, skb); 1890 } 1891 eth = eth_hdr(skb); 1892#endif 1893 } 1894 1895 f = vxlan_find_mac(vxlan, eth->h_dest); 1896 did_rsc = false; 1897 1898 if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) && 1899 (ntohs(eth->h_proto) == ETH_P_IP || 1900 ntohs(eth->h_proto) == ETH_P_IPV6)) { 1901 did_rsc = route_shortcircuit(dev, skb); 1902 if (did_rsc) 1903 f = vxlan_find_mac(vxlan, eth->h_dest); 1904 } 1905 1906 if (f == NULL) { 1907 f = vxlan_find_mac(vxlan, all_zeros_mac); 1908 if (f == NULL) { 1909 if ((vxlan->flags & VXLAN_F_L2MISS) && 1910 !is_multicast_ether_addr(eth->h_dest)) 1911 vxlan_fdb_miss(vxlan, eth->h_dest); 1912 1913 dev->stats.tx_dropped++; 1914 kfree_skb(skb); 1915 return NETDEV_TX_OK; 1916 } 1917 } 1918 1919 list_for_each_entry_rcu(rdst, &f->remotes, list) { 1920 struct sk_buff *skb1; 1921 1922 if (!fdst) { 1923 fdst = rdst; 1924 continue; 1925 } 1926 skb1 = skb_clone(skb, GFP_ATOMIC); 1927 if (skb1) 1928 vxlan_xmit_one(skb1, dev, rdst, did_rsc); 1929 } 1930 1931 if (fdst) 1932 vxlan_xmit_one(skb, dev, fdst, did_rsc); 1933 else 1934 kfree_skb(skb); 1935 return NETDEV_TX_OK; 1936} 1937 1938/* Walk the forwarding table and purge stale entries */ 1939static void vxlan_cleanup(unsigned long arg) 1940{ 1941 struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; 1942 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; 1943 unsigned int h; 1944 1945 if (!netif_running(vxlan->dev)) 1946 return; 1947 1948 spin_lock_bh(&vxlan->hash_lock); 1949 for (h = 0; h < FDB_HASH_SIZE; ++h) { 1950 struct hlist_node *p, *n; 1951 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 1952 struct vxlan_fdb *f 1953 = container_of(p, struct vxlan_fdb, hlist); 1954 unsigned long timeout; 1955 1956 if (f->state & NUD_PERMANENT) 1957 continue; 1958 1959 timeout = f->used + vxlan->age_interval * HZ; 1960 if (time_before_eq(timeout, jiffies)) { 1961 netdev_dbg(vxlan->dev, 1962 "garbage collect %pM\n", 1963 f->eth_addr); 1964 f->state = NUD_STALE; 1965 vxlan_fdb_destroy(vxlan, f); 1966 } else if (time_before(timeout, next_timer)) 1967 next_timer = timeout; 1968 } 1969 } 1970 spin_unlock_bh(&vxlan->hash_lock); 1971 1972 mod_timer(&vxlan->age_timer, next_timer); 1973} 1974 1975static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) 1976{ 1977 __u32 vni = vxlan->default_dst.remote_vni; 1978 1979 vxlan->vn_sock = vs; 1980 hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); 1981} 1982 1983/* Setup stats when device is created */ 1984static int vxlan_init(struct net_device *dev) 1985{ 1986 struct vxlan_dev *vxlan = netdev_priv(dev); 1987 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 1988 struct vxlan_sock *vs; 1989 bool ipv6 = vxlan->flags & VXLAN_F_IPV6; 1990 1991 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 1992 if (!dev->tstats) 1993 return -ENOMEM; 1994 1995 spin_lock(&vn->sock_lock); 1996 vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, 1997 vxlan->dst_port); 1998 if (vs) { 1999 /* If we have a socket with same port already, reuse it */ 2000 atomic_inc(&vs->refcnt); 2001 vxlan_vs_add_dev(vs, vxlan); 2002 } else { 2003 /* otherwise make new socket outside of RTNL */ 2004 dev_hold(dev); 2005 queue_work(vxlan_wq, &vxlan->sock_work); 2006 } 2007 spin_unlock(&vn->sock_lock); 2008 2009 return 0; 2010} 2011 2012static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) 2013{ 2014 struct vxlan_fdb *f; 2015 2016 spin_lock_bh(&vxlan->hash_lock); 2017 f = __vxlan_find_mac(vxlan, all_zeros_mac); 2018 if (f) 2019 vxlan_fdb_destroy(vxlan, f); 2020 spin_unlock_bh(&vxlan->hash_lock); 2021} 2022 2023static void vxlan_uninit(struct net_device *dev) 2024{ 2025 struct vxlan_dev *vxlan = netdev_priv(dev); 2026 struct vxlan_sock *vs = vxlan->vn_sock; 2027 2028 vxlan_fdb_delete_default(vxlan); 2029 2030 if (vs) 2031 vxlan_sock_release(vs); 2032 free_percpu(dev->tstats); 2033} 2034 2035/* Start ageing timer and join group when device is brought up */ 2036static int vxlan_open(struct net_device *dev) 2037{ 2038 struct vxlan_dev *vxlan = netdev_priv(dev); 2039 struct vxlan_sock *vs = vxlan->vn_sock; 2040 2041 /* socket hasn't been created */ 2042 if (!vs) 2043 return -ENOTCONN; 2044 2045 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { 2046 vxlan_sock_hold(vs); 2047 dev_hold(dev); 2048 queue_work(vxlan_wq, &vxlan->igmp_join); 2049 } 2050 2051 if (vxlan->age_interval) 2052 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); 2053 2054 return 0; 2055} 2056 2057/* Purge the forwarding table */ 2058static void vxlan_flush(struct vxlan_dev *vxlan) 2059{ 2060 unsigned int h; 2061 2062 spin_lock_bh(&vxlan->hash_lock); 2063 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2064 struct hlist_node *p, *n; 2065 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2066 struct vxlan_fdb *f 2067 = container_of(p, struct vxlan_fdb, hlist); 2068 /* the all_zeros_mac entry is deleted at vxlan_uninit */ 2069 if (!is_zero_ether_addr(f->eth_addr)) 2070 vxlan_fdb_destroy(vxlan, f); 2071 } 2072 } 2073 spin_unlock_bh(&vxlan->hash_lock); 2074} 2075 2076/* Cleanup timer and forwarding table on shutdown */ 2077static int vxlan_stop(struct net_device *dev) 2078{ 2079 struct vxlan_dev *vxlan = netdev_priv(dev); 2080 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2081 struct vxlan_sock *vs = vxlan->vn_sock; 2082 2083 if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && 2084 !vxlan_group_used(vn, vxlan)) { 2085 vxlan_sock_hold(vs); 2086 dev_hold(dev); 2087 queue_work(vxlan_wq, &vxlan->igmp_leave); 2088 } 2089 2090 del_timer_sync(&vxlan->age_timer); 2091 2092 vxlan_flush(vxlan); 2093 2094 return 0; 2095} 2096 2097/* Stub, nothing needs to be done. */ 2098static void vxlan_set_multicast_list(struct net_device *dev) 2099{ 2100} 2101 2102static int vxlan_change_mtu(struct net_device *dev, int new_mtu) 2103{ 2104 struct vxlan_dev *vxlan = netdev_priv(dev); 2105 struct vxlan_rdst *dst = &vxlan->default_dst; 2106 struct net_device *lowerdev; 2107 int max_mtu; 2108 2109 lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); 2110 if (lowerdev == NULL) 2111 return eth_change_mtu(dev, new_mtu); 2112 2113 if (dst->remote_ip.sa.sa_family == AF_INET6) 2114 max_mtu = lowerdev->mtu - VXLAN6_HEADROOM; 2115 else 2116 max_mtu = lowerdev->mtu - VXLAN_HEADROOM; 2117 2118 if (new_mtu < 68 || new_mtu > max_mtu) 2119 return -EINVAL; 2120 2121 dev->mtu = new_mtu; 2122 return 0; 2123} 2124 2125static const struct net_device_ops vxlan_netdev_ops = { 2126 .ndo_init = vxlan_init, 2127 .ndo_uninit = vxlan_uninit, 2128 .ndo_open = vxlan_open, 2129 .ndo_stop = vxlan_stop, 2130 .ndo_start_xmit = vxlan_xmit, 2131 .ndo_get_stats64 = ip_tunnel_get_stats64, 2132 .ndo_set_rx_mode = vxlan_set_multicast_list, 2133 .ndo_change_mtu = vxlan_change_mtu, 2134 .ndo_validate_addr = eth_validate_addr, 2135 .ndo_set_mac_address = eth_mac_addr, 2136 .ndo_fdb_add = vxlan_fdb_add, 2137 .ndo_fdb_del = vxlan_fdb_delete, 2138 .ndo_fdb_dump = vxlan_fdb_dump, 2139}; 2140 2141/* Info for udev, that this is a virtual tunnel endpoint */ 2142static struct device_type vxlan_type = { 2143 .name = "vxlan", 2144}; 2145 2146/* Calls the ndo_add_vxlan_port of the caller in order to 2147 * supply the listening VXLAN udp ports. Callers are expected 2148 * to implement the ndo_add_vxlan_port. 2149 */ 2150void vxlan_get_rx_port(struct net_device *dev) 2151{ 2152 struct vxlan_sock *vs; 2153 struct net *net = dev_net(dev); 2154 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2155 sa_family_t sa_family; 2156 __be16 port; 2157 unsigned int i; 2158 2159 spin_lock(&vn->sock_lock); 2160 for (i = 0; i < PORT_HASH_SIZE; ++i) { 2161 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { 2162 port = inet_sk(vs->sock->sk)->inet_sport; 2163 sa_family = vs->sock->sk->sk_family; 2164 dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, 2165 port); 2166 } 2167 } 2168 spin_unlock(&vn->sock_lock); 2169} 2170EXPORT_SYMBOL_GPL(vxlan_get_rx_port); 2171 2172/* Initialize the device structure. */ 2173static void vxlan_setup(struct net_device *dev) 2174{ 2175 struct vxlan_dev *vxlan = netdev_priv(dev); 2176 unsigned int h; 2177 2178 eth_hw_addr_random(dev); 2179 ether_setup(dev); 2180 if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) 2181 dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; 2182 else 2183 dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; 2184 2185 dev->netdev_ops = &vxlan_netdev_ops; 2186 dev->destructor = free_netdev; 2187 SET_NETDEV_DEVTYPE(dev, &vxlan_type); 2188 2189 dev->tx_queue_len = 0; 2190 dev->features |= NETIF_F_LLTX; 2191 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 2192 dev->features |= NETIF_F_RXCSUM; 2193 dev->features |= NETIF_F_GSO_SOFTWARE; 2194 2195 dev->vlan_features = dev->features; 2196 dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; 2197 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 2198 dev->hw_features |= NETIF_F_GSO_SOFTWARE; 2199 dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; 2200 netif_keep_dst(dev); 2201 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 2202 2203 INIT_LIST_HEAD(&vxlan->next); 2204 spin_lock_init(&vxlan->hash_lock); 2205 INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join); 2206 INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave); 2207 INIT_WORK(&vxlan->sock_work, vxlan_sock_work); 2208 2209 init_timer_deferrable(&vxlan->age_timer); 2210 vxlan->age_timer.function = vxlan_cleanup; 2211 vxlan->age_timer.data = (unsigned long) vxlan; 2212 2213 vxlan->dst_port = htons(vxlan_port); 2214 2215 vxlan->dev = dev; 2216 2217 for (h = 0; h < FDB_HASH_SIZE; ++h) 2218 INIT_HLIST_HEAD(&vxlan->fdb_head[h]); 2219} 2220 2221static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { 2222 [IFLA_VXLAN_ID] = { .type = NLA_U32 }, 2223 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 2224 [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, 2225 [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, 2226 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 2227 [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, 2228 [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, 2229 [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, 2230 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, 2231 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 2232 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 2233 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 2234 [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 2235 [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 2236 [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 2237 [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 2238 [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, 2239}; 2240 2241static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) 2242{ 2243 if (tb[IFLA_ADDRESS]) { 2244 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { 2245 pr_debug("invalid link address (not ethernet)\n"); 2246 return -EINVAL; 2247 } 2248 2249 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 2250 pr_debug("invalid all zero ethernet address\n"); 2251 return -EADDRNOTAVAIL; 2252 } 2253 } 2254 2255 if (!data) 2256 return -EINVAL; 2257 2258 if (data[IFLA_VXLAN_ID]) { 2259 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); 2260 if (id >= VXLAN_VID_MASK) 2261 return -ERANGE; 2262 } 2263 2264 if (data[IFLA_VXLAN_PORT_RANGE]) { 2265 const struct ifla_vxlan_port_range *p 2266 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 2267 2268 if (ntohs(p->high) < ntohs(p->low)) { 2269 pr_debug("port range %u .. %u not valid\n", 2270 ntohs(p->low), ntohs(p->high)); 2271 return -EINVAL; 2272 } 2273 } 2274 2275 return 0; 2276} 2277 2278static void vxlan_get_drvinfo(struct net_device *netdev, 2279 struct ethtool_drvinfo *drvinfo) 2280{ 2281 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); 2282 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); 2283} 2284 2285static const struct ethtool_ops vxlan_ethtool_ops = { 2286 .get_drvinfo = vxlan_get_drvinfo, 2287 .get_link = ethtool_op_get_link, 2288}; 2289 2290static void vxlan_del_work(struct work_struct *work) 2291{ 2292 struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); 2293 udp_tunnel_sock_release(vs->sock); 2294 kfree_rcu(vs, rcu); 2295} 2296 2297static struct socket *vxlan_create_sock(struct net *net, bool ipv6, 2298 __be16 port, u32 flags) 2299{ 2300 struct socket *sock; 2301 struct udp_port_cfg udp_conf; 2302 int err; 2303 2304 memset(&udp_conf, 0, sizeof(udp_conf)); 2305 2306 if (ipv6) { 2307 udp_conf.family = AF_INET6; 2308 udp_conf.use_udp6_tx_checksums = 2309 !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); 2310 udp_conf.use_udp6_rx_checksums = 2311 !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); 2312 } else { 2313 udp_conf.family = AF_INET; 2314 udp_conf.local_ip.s_addr = INADDR_ANY; 2315 udp_conf.use_udp_checksums = 2316 !!(flags & VXLAN_F_UDP_CSUM); 2317 } 2318 2319 udp_conf.local_udp_port = port; 2320 2321 /* Open UDP socket */ 2322 err = udp_sock_create(net, &udp_conf, &sock); 2323 if (err < 0) 2324 return ERR_PTR(err); 2325 2326 return sock; 2327} 2328 2329/* Create new listen socket if needed */ 2330static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, 2331 vxlan_rcv_t *rcv, void *data, 2332 u32 flags) 2333{ 2334 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2335 struct vxlan_sock *vs; 2336 struct socket *sock; 2337 unsigned int h; 2338 bool ipv6 = !!(flags & VXLAN_F_IPV6); 2339 struct udp_tunnel_sock_cfg tunnel_cfg; 2340 2341 vs = kzalloc(sizeof(*vs), GFP_KERNEL); 2342 if (!vs) 2343 return ERR_PTR(-ENOMEM); 2344 2345 for (h = 0; h < VNI_HASH_SIZE; ++h) 2346 INIT_HLIST_HEAD(&vs->vni_list[h]); 2347 2348 INIT_WORK(&vs->del_work, vxlan_del_work); 2349 2350 sock = vxlan_create_sock(net, ipv6, port, flags); 2351 if (IS_ERR(sock)) { 2352 kfree(vs); 2353 return ERR_CAST(sock); 2354 } 2355 2356 vs->sock = sock; 2357 atomic_set(&vs->refcnt, 1); 2358 vs->rcv = rcv; 2359 vs->data = data; 2360 2361 /* Initialize the vxlan udp offloads structure */ 2362 vs->udp_offloads.port = port; 2363 vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive; 2364 vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete; 2365 2366 spin_lock(&vn->sock_lock); 2367 hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); 2368 vxlan_notify_add_rx_port(vs); 2369 spin_unlock(&vn->sock_lock); 2370 2371 /* Mark socket as an encapsulation socket. */ 2372 tunnel_cfg.sk_user_data = vs; 2373 tunnel_cfg.encap_type = 1; 2374 tunnel_cfg.encap_rcv = vxlan_udp_encap_recv; 2375 tunnel_cfg.encap_destroy = NULL; 2376 2377 setup_udp_tunnel_sock(net, sock, &tunnel_cfg); 2378 2379 return vs; 2380} 2381 2382struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, 2383 vxlan_rcv_t *rcv, void *data, 2384 bool no_share, u32 flags) 2385{ 2386 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2387 struct vxlan_sock *vs; 2388 bool ipv6 = flags & VXLAN_F_IPV6; 2389 2390 vs = vxlan_socket_create(net, port, rcv, data, flags); 2391 if (!IS_ERR(vs)) 2392 return vs; 2393 2394 if (no_share) /* Return error if sharing is not allowed. */ 2395 return vs; 2396 2397 spin_lock(&vn->sock_lock); 2398 vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port); 2399 if (vs) { 2400 if (vs->rcv == rcv) 2401 atomic_inc(&vs->refcnt); 2402 else 2403 vs = ERR_PTR(-EBUSY); 2404 } 2405 spin_unlock(&vn->sock_lock); 2406 2407 if (!vs) 2408 vs = ERR_PTR(-EINVAL); 2409 2410 return vs; 2411} 2412EXPORT_SYMBOL_GPL(vxlan_sock_add); 2413 2414/* Scheduled at device creation to bind to a socket */ 2415static void vxlan_sock_work(struct work_struct *work) 2416{ 2417 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work); 2418 struct net *net = vxlan->net; 2419 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2420 __be16 port = vxlan->dst_port; 2421 struct vxlan_sock *nvs; 2422 2423 nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags); 2424 spin_lock(&vn->sock_lock); 2425 if (!IS_ERR(nvs)) 2426 vxlan_vs_add_dev(nvs, vxlan); 2427 spin_unlock(&vn->sock_lock); 2428 2429 dev_put(vxlan->dev); 2430} 2431 2432static int vxlan_newlink(struct net *net, struct net_device *dev, 2433 struct nlattr *tb[], struct nlattr *data[]) 2434{ 2435 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2436 struct vxlan_dev *vxlan = netdev_priv(dev); 2437 struct vxlan_rdst *dst = &vxlan->default_dst; 2438 __u32 vni; 2439 int err; 2440 bool use_ipv6 = false; 2441 2442 if (!data[IFLA_VXLAN_ID]) 2443 return -EINVAL; 2444 2445 vxlan->net = dev_net(dev); 2446 2447 vni = nla_get_u32(data[IFLA_VXLAN_ID]); 2448 dst->remote_vni = vni; 2449 2450 /* Unless IPv6 is explicitly requested, assume IPv4 */ 2451 dst->remote_ip.sa.sa_family = AF_INET; 2452 if (data[IFLA_VXLAN_GROUP]) { 2453 dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]); 2454 } else if (data[IFLA_VXLAN_GROUP6]) { 2455 if (!IS_ENABLED(CONFIG_IPV6)) 2456 return -EPFNOSUPPORT; 2457 2458 nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6], 2459 sizeof(struct in6_addr)); 2460 dst->remote_ip.sa.sa_family = AF_INET6; 2461 use_ipv6 = true; 2462 } 2463 2464 if (data[IFLA_VXLAN_LOCAL]) { 2465 vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); 2466 vxlan->saddr.sa.sa_family = AF_INET; 2467 } else if (data[IFLA_VXLAN_LOCAL6]) { 2468 if (!IS_ENABLED(CONFIG_IPV6)) 2469 return -EPFNOSUPPORT; 2470 2471 /* TODO: respect scope id */ 2472 nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6], 2473 sizeof(struct in6_addr)); 2474 vxlan->saddr.sa.sa_family = AF_INET6; 2475 use_ipv6 = true; 2476 } 2477 2478 if (data[IFLA_VXLAN_LINK] && 2479 (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { 2480 struct net_device *lowerdev 2481 = __dev_get_by_index(net, dst->remote_ifindex); 2482 2483 if (!lowerdev) { 2484 pr_info("ifindex %d does not exist\n", dst->remote_ifindex); 2485 return -ENODEV; 2486 } 2487 2488#if IS_ENABLED(CONFIG_IPV6) 2489 if (use_ipv6) { 2490 struct inet6_dev *idev = __in6_dev_get(lowerdev); 2491 if (idev && idev->cnf.disable_ipv6) { 2492 pr_info("IPv6 is disabled via sysctl\n"); 2493 return -EPERM; 2494 } 2495 vxlan->flags |= VXLAN_F_IPV6; 2496 } 2497#endif 2498 2499 if (!tb[IFLA_MTU]) 2500 dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2501 2502 dev->needed_headroom = lowerdev->hard_header_len + 2503 (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2504 } else if (use_ipv6) 2505 vxlan->flags |= VXLAN_F_IPV6; 2506 2507 if (data[IFLA_VXLAN_TOS]) 2508 vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); 2509 2510 if (data[IFLA_VXLAN_TTL]) 2511 vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 2512 2513 if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) 2514 vxlan->flags |= VXLAN_F_LEARN; 2515 2516 if (data[IFLA_VXLAN_AGEING]) 2517 vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 2518 else 2519 vxlan->age_interval = FDB_AGE_DEFAULT; 2520 2521 if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) 2522 vxlan->flags |= VXLAN_F_PROXY; 2523 2524 if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) 2525 vxlan->flags |= VXLAN_F_RSC; 2526 2527 if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) 2528 vxlan->flags |= VXLAN_F_L2MISS; 2529 2530 if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) 2531 vxlan->flags |= VXLAN_F_L3MISS; 2532 2533 if (data[IFLA_VXLAN_LIMIT]) 2534 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); 2535 2536 if (data[IFLA_VXLAN_PORT_RANGE]) { 2537 const struct ifla_vxlan_port_range *p 2538 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 2539 vxlan->port_min = ntohs(p->low); 2540 vxlan->port_max = ntohs(p->high); 2541 } 2542 2543 if (data[IFLA_VXLAN_PORT]) 2544 vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); 2545 2546 if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) 2547 vxlan->flags |= VXLAN_F_UDP_CSUM; 2548 2549 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] && 2550 nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) 2551 vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; 2552 2553 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] && 2554 nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) 2555 vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; 2556 2557 if (vxlan_find_vni(net, vni, use_ipv6 ? AF_INET6 : AF_INET, 2558 vxlan->dst_port)) { 2559 pr_info("duplicate VNI %u\n", vni); 2560 return -EEXIST; 2561 } 2562 2563 dev->ethtool_ops = &vxlan_ethtool_ops; 2564 2565 /* create an fdb entry for a valid default destination */ 2566 if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { 2567 err = vxlan_fdb_create(vxlan, all_zeros_mac, 2568 &vxlan->default_dst.remote_ip, 2569 NUD_REACHABLE|NUD_PERMANENT, 2570 NLM_F_EXCL|NLM_F_CREATE, 2571 vxlan->dst_port, 2572 vxlan->default_dst.remote_vni, 2573 vxlan->default_dst.remote_ifindex, 2574 NTF_SELF); 2575 if (err) 2576 return err; 2577 } 2578 2579 err = register_netdevice(dev); 2580 if (err) { 2581 vxlan_fdb_delete_default(vxlan); 2582 return err; 2583 } 2584 2585 list_add(&vxlan->next, &vn->vxlan_list); 2586 2587 return 0; 2588} 2589 2590static void vxlan_dellink(struct net_device *dev, struct list_head *head) 2591{ 2592 struct vxlan_dev *vxlan = netdev_priv(dev); 2593 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2594 2595 spin_lock(&vn->sock_lock); 2596 if (!hlist_unhashed(&vxlan->hlist)) 2597 hlist_del_rcu(&vxlan->hlist); 2598 spin_unlock(&vn->sock_lock); 2599 2600 list_del(&vxlan->next); 2601 unregister_netdevice_queue(dev, head); 2602} 2603 2604static size_t vxlan_get_size(const struct net_device *dev) 2605{ 2606 2607 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ 2608 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ 2609 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ 2610 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ 2611 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 2612 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 2613 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 2614 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 2615 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 2616 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 2617 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 2618 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 2619 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 2620 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + 2621 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ 2622 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ 2623 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ 2624 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ 2625 0; 2626} 2627 2628static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) 2629{ 2630 const struct vxlan_dev *vxlan = netdev_priv(dev); 2631 const struct vxlan_rdst *dst = &vxlan->default_dst; 2632 struct ifla_vxlan_port_range ports = { 2633 .low = htons(vxlan->port_min), 2634 .high = htons(vxlan->port_max), 2635 }; 2636 2637 if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) 2638 goto nla_put_failure; 2639 2640 if (!vxlan_addr_any(&dst->remote_ip)) { 2641 if (dst->remote_ip.sa.sa_family == AF_INET) { 2642 if (nla_put_be32(skb, IFLA_VXLAN_GROUP, 2643 dst->remote_ip.sin.sin_addr.s_addr)) 2644 goto nla_put_failure; 2645#if IS_ENABLED(CONFIG_IPV6) 2646 } else { 2647 if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr), 2648 &dst->remote_ip.sin6.sin6_addr)) 2649 goto nla_put_failure; 2650#endif 2651 } 2652 } 2653 2654 if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) 2655 goto nla_put_failure; 2656 2657 if (!vxlan_addr_any(&vxlan->saddr)) { 2658 if (vxlan->saddr.sa.sa_family == AF_INET) { 2659 if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, 2660 vxlan->saddr.sin.sin_addr.s_addr)) 2661 goto nla_put_failure; 2662#if IS_ENABLED(CONFIG_IPV6) 2663 } else { 2664 if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), 2665 &vxlan->saddr.sin6.sin6_addr)) 2666 goto nla_put_failure; 2667#endif 2668 } 2669 } 2670 2671 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || 2672 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || 2673 nla_put_u8(skb, IFLA_VXLAN_LEARNING, 2674 !!(vxlan->flags & VXLAN_F_LEARN)) || 2675 nla_put_u8(skb, IFLA_VXLAN_PROXY, 2676 !!(vxlan->flags & VXLAN_F_PROXY)) || 2677 nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || 2678 nla_put_u8(skb, IFLA_VXLAN_L2MISS, 2679 !!(vxlan->flags & VXLAN_F_L2MISS)) || 2680 nla_put_u8(skb, IFLA_VXLAN_L3MISS, 2681 !!(vxlan->flags & VXLAN_F_L3MISS)) || 2682 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || 2683 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || 2684 nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || 2685 nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, 2686 !!(vxlan->flags & VXLAN_F_UDP_CSUM)) || 2687 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 2688 !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || 2689 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 2690 !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX))) 2691 goto nla_put_failure; 2692 2693 if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) 2694 goto nla_put_failure; 2695 2696 return 0; 2697 2698nla_put_failure: 2699 return -EMSGSIZE; 2700} 2701 2702static struct rtnl_link_ops vxlan_link_ops __read_mostly = { 2703 .kind = "vxlan", 2704 .maxtype = IFLA_VXLAN_MAX, 2705 .policy = vxlan_policy, 2706 .priv_size = sizeof(struct vxlan_dev), 2707 .setup = vxlan_setup, 2708 .validate = vxlan_validate, 2709 .newlink = vxlan_newlink, 2710 .dellink = vxlan_dellink, 2711 .get_size = vxlan_get_size, 2712 .fill_info = vxlan_fill_info, 2713}; 2714 2715static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, 2716 struct net_device *dev) 2717{ 2718 struct vxlan_dev *vxlan, *next; 2719 LIST_HEAD(list_kill); 2720 2721 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 2722 struct vxlan_rdst *dst = &vxlan->default_dst; 2723 2724 /* In case we created vxlan device with carrier 2725 * and we loose the carrier due to module unload 2726 * we also need to remove vxlan device. In other 2727 * cases, it's not necessary and remote_ifindex 2728 * is 0 here, so no matches. 2729 */ 2730 if (dst->remote_ifindex == dev->ifindex) 2731 vxlan_dellink(vxlan->dev, &list_kill); 2732 } 2733 2734 unregister_netdevice_many(&list_kill); 2735} 2736 2737static int vxlan_lowerdev_event(struct notifier_block *unused, 2738 unsigned long event, void *ptr) 2739{ 2740 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2741 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 2742 2743 if (event == NETDEV_UNREGISTER) 2744 vxlan_handle_lowerdev_unregister(vn, dev); 2745 2746 return NOTIFY_DONE; 2747} 2748 2749static struct notifier_block vxlan_notifier_block __read_mostly = { 2750 .notifier_call = vxlan_lowerdev_event, 2751}; 2752 2753static __net_init int vxlan_init_net(struct net *net) 2754{ 2755 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2756 unsigned int h; 2757 2758 INIT_LIST_HEAD(&vn->vxlan_list); 2759 spin_lock_init(&vn->sock_lock); 2760 2761 for (h = 0; h < PORT_HASH_SIZE; ++h) 2762 INIT_HLIST_HEAD(&vn->sock_list[h]); 2763 2764 return 0; 2765} 2766 2767static void __net_exit vxlan_exit_net(struct net *net) 2768{ 2769 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2770 struct vxlan_dev *vxlan, *next; 2771 struct net_device *dev, *aux; 2772 LIST_HEAD(list); 2773 2774 rtnl_lock(); 2775 for_each_netdev_safe(net, dev, aux) 2776 if (dev->rtnl_link_ops == &vxlan_link_ops) 2777 unregister_netdevice_queue(dev, &list); 2778 2779 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 2780 /* If vxlan->dev is in the same netns, it has already been added 2781 * to the list by the previous loop. 2782 */ 2783 if (!net_eq(dev_net(vxlan->dev), net)) 2784 unregister_netdevice_queue(dev, &list); 2785 } 2786 2787 unregister_netdevice_many(&list); 2788 rtnl_unlock(); 2789} 2790 2791static struct pernet_operations vxlan_net_ops = { 2792 .init = vxlan_init_net, 2793 .exit = vxlan_exit_net, 2794 .id = &vxlan_net_id, 2795 .size = sizeof(struct vxlan_net), 2796}; 2797 2798static int __init vxlan_init_module(void) 2799{ 2800 int rc; 2801 2802 vxlan_wq = alloc_workqueue("vxlan", 0, 0); 2803 if (!vxlan_wq) 2804 return -ENOMEM; 2805 2806 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); 2807 2808 rc = register_pernet_subsys(&vxlan_net_ops); 2809 if (rc) 2810 goto out1; 2811 2812 rc = register_netdevice_notifier(&vxlan_notifier_block); 2813 if (rc) 2814 goto out2; 2815 2816 rc = rtnl_link_register(&vxlan_link_ops); 2817 if (rc) 2818 goto out3; 2819 2820 return 0; 2821out3: 2822 unregister_netdevice_notifier(&vxlan_notifier_block); 2823out2: 2824 unregister_pernet_subsys(&vxlan_net_ops); 2825out1: 2826 destroy_workqueue(vxlan_wq); 2827 return rc; 2828} 2829late_initcall(vxlan_init_module); 2830 2831static void __exit vxlan_cleanup_module(void) 2832{ 2833 rtnl_link_unregister(&vxlan_link_ops); 2834 unregister_netdevice_notifier(&vxlan_notifier_block); 2835 destroy_workqueue(vxlan_wq); 2836 unregister_pernet_subsys(&vxlan_net_ops); 2837 /* rcu_barrier() is called by netns */ 2838} 2839module_exit(vxlan_cleanup_module); 2840 2841MODULE_LICENSE("GPL"); 2842MODULE_VERSION(VXLAN_VERSION); 2843MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); 2844MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); 2845MODULE_ALIAS_RTNL_LINK("vxlan");