Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v3.17-rc1 2896 lines 74 kB view raw
1/* 2 * VXLAN: Virtual eXtensible Local Area Network 3 * 4 * Copyright (c) 2012-2013 Vyatta Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 2 as 8 * published by the Free Software Foundation. 9 */ 10 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13#include <linux/kernel.h> 14#include <linux/types.h> 15#include <linux/module.h> 16#include <linux/errno.h> 17#include <linux/slab.h> 18#include <linux/skbuff.h> 19#include <linux/rculist.h> 20#include <linux/netdevice.h> 21#include <linux/in.h> 22#include <linux/ip.h> 23#include <linux/udp.h> 24#include <linux/igmp.h> 25#include <linux/etherdevice.h> 26#include <linux/if_ether.h> 27#include <linux/if_vlan.h> 28#include <linux/hash.h> 29#include <linux/ethtool.h> 30#include <net/arp.h> 31#include <net/ndisc.h> 32#include <net/ip.h> 33#include <net/ip_tunnels.h> 34#include <net/icmp.h> 35#include <net/udp.h> 36#include <net/udp_tunnel.h> 37#include <net/rtnetlink.h> 38#include <net/route.h> 39#include <net/dsfield.h> 40#include <net/inet_ecn.h> 41#include <net/net_namespace.h> 42#include <net/netns/generic.h> 43#include <net/vxlan.h> 44#include <net/protocol.h> 45#if IS_ENABLED(CONFIG_IPV6) 46#include <net/ipv6.h> 47#include <net/addrconf.h> 48#include <net/ip6_tunnel.h> 49#include <net/ip6_checksum.h> 50#endif 51 52#define VXLAN_VERSION "0.1" 53 54#define PORT_HASH_BITS 8 55#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) 56#define VNI_HASH_BITS 10 57#define VNI_HASH_SIZE (1<<VNI_HASH_BITS) 58#define FDB_HASH_BITS 8 59#define FDB_HASH_SIZE (1<<FDB_HASH_BITS) 60#define FDB_AGE_DEFAULT 300 /* 5 min */ 61#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ 62 63#define VXLAN_N_VID (1u << 24) 64#define VXLAN_VID_MASK (VXLAN_N_VID - 1) 65#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) 66 67#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */ 68 69/* VXLAN protocol header */ 70struct vxlanhdr { 71 __be32 vx_flags; 72 __be32 vx_vni; 73}; 74 75/* UDP port for VXLAN traffic. 76 * The IANA assigned port is 4789, but the Linux default is 8472 77 * for compatibility with early adopters. 78 */ 79static unsigned short vxlan_port __read_mostly = 8472; 80module_param_named(udp_port, vxlan_port, ushort, 0444); 81MODULE_PARM_DESC(udp_port, "Destination UDP port"); 82 83static bool log_ecn_error = true; 84module_param(log_ecn_error, bool, 0644); 85MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 86 87static int vxlan_net_id; 88 89static const u8 all_zeros_mac[ETH_ALEN]; 90 91/* per-network namespace private data for this module */ 92struct vxlan_net { 93 struct list_head vxlan_list; 94 struct hlist_head sock_list[PORT_HASH_SIZE]; 95 spinlock_t sock_lock; 96}; 97 98union vxlan_addr { 99 struct sockaddr_in sin; 100 struct sockaddr_in6 sin6; 101 struct sockaddr sa; 102}; 103 104struct vxlan_rdst { 105 union vxlan_addr remote_ip; 106 __be16 remote_port; 107 u32 remote_vni; 108 u32 remote_ifindex; 109 struct list_head list; 110 struct rcu_head rcu; 111}; 112 113/* Forwarding table entry */ 114struct vxlan_fdb { 115 struct hlist_node hlist; /* linked list of entries */ 116 struct rcu_head rcu; 117 unsigned long updated; /* jiffies */ 118 unsigned long used; 119 struct list_head remotes; 120 u16 state; /* see ndm_state */ 121 u8 flags; /* see ndm_flags */ 122 u8 eth_addr[ETH_ALEN]; 123}; 124 125/* Pseudo network device */ 126struct vxlan_dev { 127 struct hlist_node hlist; /* vni hash table */ 128 struct list_head next; /* vxlan's per namespace list */ 129 struct vxlan_sock *vn_sock; /* listening socket */ 130 struct net_device *dev; 131 struct net *net; /* netns for packet i/o */ 132 struct vxlan_rdst default_dst; /* default destination */ 133 union vxlan_addr saddr; /* source address */ 134 __be16 dst_port; 135 __u16 port_min; /* source port range */ 136 __u16 port_max; 137 __u8 tos; /* TOS override */ 138 __u8 ttl; 139 u32 flags; /* VXLAN_F_* in vxlan.h */ 140 141 struct work_struct sock_work; 142 struct work_struct igmp_join; 143 struct work_struct igmp_leave; 144 145 unsigned long age_interval; 146 struct timer_list age_timer; 147 spinlock_t hash_lock; 148 unsigned int addrcnt; 149 unsigned int addrmax; 150 151 struct hlist_head fdb_head[FDB_HASH_SIZE]; 152}; 153 154/* salt for hash table */ 155static u32 vxlan_salt __read_mostly; 156static struct workqueue_struct *vxlan_wq; 157 158static void vxlan_sock_work(struct work_struct *work); 159 160#if IS_ENABLED(CONFIG_IPV6) 161static inline 162bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 163{ 164 if (a->sa.sa_family != b->sa.sa_family) 165 return false; 166 if (a->sa.sa_family == AF_INET6) 167 return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); 168 else 169 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 170} 171 172static inline bool vxlan_addr_any(const union vxlan_addr *ipa) 173{ 174 if (ipa->sa.sa_family == AF_INET6) 175 return ipv6_addr_any(&ipa->sin6.sin6_addr); 176 else 177 return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); 178} 179 180static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) 181{ 182 if (ipa->sa.sa_family == AF_INET6) 183 return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr); 184 else 185 return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); 186} 187 188static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 189{ 190 if (nla_len(nla) >= sizeof(struct in6_addr)) { 191 nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr)); 192 ip->sa.sa_family = AF_INET6; 193 return 0; 194 } else if (nla_len(nla) >= sizeof(__be32)) { 195 ip->sin.sin_addr.s_addr = nla_get_be32(nla); 196 ip->sa.sa_family = AF_INET; 197 return 0; 198 } else { 199 return -EAFNOSUPPORT; 200 } 201} 202 203static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 204 const union vxlan_addr *ip) 205{ 206 if (ip->sa.sa_family == AF_INET6) 207 return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr); 208 else 209 return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); 210} 211 212#else /* !CONFIG_IPV6 */ 213 214static inline 215bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 216{ 217 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 218} 219 220static inline bool vxlan_addr_any(const union vxlan_addr *ipa) 221{ 222 return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY); 223} 224 225static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa) 226{ 227 return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr)); 228} 229 230static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 231{ 232 if (nla_len(nla) >= sizeof(struct in6_addr)) { 233 return -EAFNOSUPPORT; 234 } else if (nla_len(nla) >= sizeof(__be32)) { 235 ip->sin.sin_addr.s_addr = nla_get_be32(nla); 236 ip->sa.sa_family = AF_INET; 237 return 0; 238 } else { 239 return -EAFNOSUPPORT; 240 } 241} 242 243static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 244 const union vxlan_addr *ip) 245{ 246 return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr); 247} 248#endif 249 250/* Virtual Network hash table head */ 251static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id) 252{ 253 return &vs->vni_list[hash_32(id, VNI_HASH_BITS)]; 254} 255 256/* Socket hash table head */ 257static inline struct hlist_head *vs_head(struct net *net, __be16 port) 258{ 259 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 260 261 return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; 262} 263 264/* First remote destination for a forwarding entry. 265 * Guaranteed to be non-NULL because remotes are never deleted. 266 */ 267static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) 268{ 269 return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); 270} 271 272static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) 273{ 274 return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); 275} 276 277/* Find VXLAN socket based on network namespace and UDP port */ 278static struct vxlan_sock *vxlan_find_sock(struct net *net, __be16 port) 279{ 280 struct vxlan_sock *vs; 281 282 hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { 283 if (inet_sk(vs->sock->sk)->inet_sport == port) 284 return vs; 285 } 286 return NULL; 287} 288 289static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, u32 id) 290{ 291 struct vxlan_dev *vxlan; 292 293 hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) { 294 if (vxlan->default_dst.remote_vni == id) 295 return vxlan; 296 } 297 298 return NULL; 299} 300 301/* Look up VNI in a per net namespace table */ 302static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port) 303{ 304 struct vxlan_sock *vs; 305 306 vs = vxlan_find_sock(net, port); 307 if (!vs) 308 return NULL; 309 310 return vxlan_vs_find_vni(vs, id); 311} 312 313/* Fill in neighbour message in skbuff. */ 314static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, 315 const struct vxlan_fdb *fdb, 316 u32 portid, u32 seq, int type, unsigned int flags, 317 const struct vxlan_rdst *rdst) 318{ 319 unsigned long now = jiffies; 320 struct nda_cacheinfo ci; 321 struct nlmsghdr *nlh; 322 struct ndmsg *ndm; 323 bool send_ip, send_eth; 324 325 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); 326 if (nlh == NULL) 327 return -EMSGSIZE; 328 329 ndm = nlmsg_data(nlh); 330 memset(ndm, 0, sizeof(*ndm)); 331 332 send_eth = send_ip = true; 333 334 if (type == RTM_GETNEIGH) { 335 ndm->ndm_family = AF_INET; 336 send_ip = !vxlan_addr_any(&rdst->remote_ip); 337 send_eth = !is_zero_ether_addr(fdb->eth_addr); 338 } else 339 ndm->ndm_family = AF_BRIDGE; 340 ndm->ndm_state = fdb->state; 341 ndm->ndm_ifindex = vxlan->dev->ifindex; 342 ndm->ndm_flags = fdb->flags; 343 ndm->ndm_type = RTN_UNICAST; 344 345 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 346 goto nla_put_failure; 347 348 if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) 349 goto nla_put_failure; 350 351 if (rdst->remote_port && rdst->remote_port != vxlan->dst_port && 352 nla_put_be16(skb, NDA_PORT, rdst->remote_port)) 353 goto nla_put_failure; 354 if (rdst->remote_vni != vxlan->default_dst.remote_vni && 355 nla_put_u32(skb, NDA_VNI, rdst->remote_vni)) 356 goto nla_put_failure; 357 if (rdst->remote_ifindex && 358 nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) 359 goto nla_put_failure; 360 361 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); 362 ci.ndm_confirmed = 0; 363 ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); 364 ci.ndm_refcnt = 0; 365 366 if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) 367 goto nla_put_failure; 368 369 return nlmsg_end(skb, nlh); 370 371nla_put_failure: 372 nlmsg_cancel(skb, nlh); 373 return -EMSGSIZE; 374} 375 376static inline size_t vxlan_nlmsg_size(void) 377{ 378 return NLMSG_ALIGN(sizeof(struct ndmsg)) 379 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ 380 + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ 381 + nla_total_size(sizeof(__be16)) /* NDA_PORT */ 382 + nla_total_size(sizeof(__be32)) /* NDA_VNI */ 383 + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ 384 + nla_total_size(sizeof(struct nda_cacheinfo)); 385} 386 387static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, 388 struct vxlan_rdst *rd, int type) 389{ 390 struct net *net = dev_net(vxlan->dev); 391 struct sk_buff *skb; 392 int err = -ENOBUFS; 393 394 skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); 395 if (skb == NULL) 396 goto errout; 397 398 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); 399 if (err < 0) { 400 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ 401 WARN_ON(err == -EMSGSIZE); 402 kfree_skb(skb); 403 goto errout; 404 } 405 406 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); 407 return; 408errout: 409 if (err < 0) 410 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 411} 412 413static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) 414{ 415 struct vxlan_dev *vxlan = netdev_priv(dev); 416 struct vxlan_fdb f = { 417 .state = NUD_STALE, 418 }; 419 struct vxlan_rdst remote = { 420 .remote_ip = *ipa, /* goes to NDA_DST */ 421 .remote_vni = VXLAN_N_VID, 422 }; 423 424 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH); 425} 426 427static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) 428{ 429 struct vxlan_fdb f = { 430 .state = NUD_STALE, 431 }; 432 struct vxlan_rdst remote = { }; 433 434 memcpy(f.eth_addr, eth_addr, ETH_ALEN); 435 436 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH); 437} 438 439/* Hash Ethernet address */ 440static u32 eth_hash(const unsigned char *addr) 441{ 442 u64 value = get_unaligned((u64 *)addr); 443 444 /* only want 6 bytes */ 445#ifdef __BIG_ENDIAN 446 value >>= 16; 447#else 448 value <<= 16; 449#endif 450 return hash_64(value, FDB_HASH_BITS); 451} 452 453/* Hash chain to use given mac address */ 454static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, 455 const u8 *mac) 456{ 457 return &vxlan->fdb_head[eth_hash(mac)]; 458} 459 460/* Look up Ethernet address in forwarding table */ 461static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, 462 const u8 *mac) 463{ 464 struct hlist_head *head = vxlan_fdb_head(vxlan, mac); 465 struct vxlan_fdb *f; 466 467 hlist_for_each_entry_rcu(f, head, hlist) { 468 if (ether_addr_equal(mac, f->eth_addr)) 469 return f; 470 } 471 472 return NULL; 473} 474 475static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, 476 const u8 *mac) 477{ 478 struct vxlan_fdb *f; 479 480 f = __vxlan_find_mac(vxlan, mac); 481 if (f) 482 f->used = jiffies; 483 484 return f; 485} 486 487/* caller should hold vxlan->hash_lock */ 488static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, 489 union vxlan_addr *ip, __be16 port, 490 __u32 vni, __u32 ifindex) 491{ 492 struct vxlan_rdst *rd; 493 494 list_for_each_entry(rd, &f->remotes, list) { 495 if (vxlan_addr_equal(&rd->remote_ip, ip) && 496 rd->remote_port == port && 497 rd->remote_vni == vni && 498 rd->remote_ifindex == ifindex) 499 return rd; 500 } 501 502 return NULL; 503} 504 505/* Replace destination of unicast mac */ 506static int vxlan_fdb_replace(struct vxlan_fdb *f, 507 union vxlan_addr *ip, __be16 port, __u32 vni, __u32 ifindex) 508{ 509 struct vxlan_rdst *rd; 510 511 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 512 if (rd) 513 return 0; 514 515 rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); 516 if (!rd) 517 return 0; 518 rd->remote_ip = *ip; 519 rd->remote_port = port; 520 rd->remote_vni = vni; 521 rd->remote_ifindex = ifindex; 522 return 1; 523} 524 525/* Add/update destinations for multicast */ 526static int vxlan_fdb_append(struct vxlan_fdb *f, 527 union vxlan_addr *ip, __be16 port, __u32 vni, 528 __u32 ifindex, struct vxlan_rdst **rdp) 529{ 530 struct vxlan_rdst *rd; 531 532 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 533 if (rd) 534 return 0; 535 536 rd = kmalloc(sizeof(*rd), GFP_ATOMIC); 537 if (rd == NULL) 538 return -ENOBUFS; 539 rd->remote_ip = *ip; 540 rd->remote_port = port; 541 rd->remote_vni = vni; 542 rd->remote_ifindex = ifindex; 543 544 list_add_tail_rcu(&rd->list, &f->remotes); 545 546 *rdp = rd; 547 return 1; 548} 549 550static struct sk_buff **vxlan_gro_receive(struct sk_buff **head, struct sk_buff *skb) 551{ 552 struct sk_buff *p, **pp = NULL; 553 struct vxlanhdr *vh, *vh2; 554 struct ethhdr *eh, *eh2; 555 unsigned int hlen, off_vx, off_eth; 556 const struct packet_offload *ptype; 557 __be16 type; 558 int flush = 1; 559 560 off_vx = skb_gro_offset(skb); 561 hlen = off_vx + sizeof(*vh); 562 vh = skb_gro_header_fast(skb, off_vx); 563 if (skb_gro_header_hard(skb, hlen)) { 564 vh = skb_gro_header_slow(skb, hlen, off_vx); 565 if (unlikely(!vh)) 566 goto out; 567 } 568 skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ 569 skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); 570 571 off_eth = skb_gro_offset(skb); 572 hlen = off_eth + sizeof(*eh); 573 eh = skb_gro_header_fast(skb, off_eth); 574 if (skb_gro_header_hard(skb, hlen)) { 575 eh = skb_gro_header_slow(skb, hlen, off_eth); 576 if (unlikely(!eh)) 577 goto out; 578 } 579 580 flush = 0; 581 582 for (p = *head; p; p = p->next) { 583 if (!NAPI_GRO_CB(p)->same_flow) 584 continue; 585 586 vh2 = (struct vxlanhdr *)(p->data + off_vx); 587 eh2 = (struct ethhdr *)(p->data + off_eth); 588 if (vh->vx_vni != vh2->vx_vni || compare_ether_header(eh, eh2)) { 589 NAPI_GRO_CB(p)->same_flow = 0; 590 continue; 591 } 592 } 593 594 type = eh->h_proto; 595 596 rcu_read_lock(); 597 ptype = gro_find_receive_by_type(type); 598 if (ptype == NULL) { 599 flush = 1; 600 goto out_unlock; 601 } 602 603 skb_gro_pull(skb, sizeof(*eh)); /* pull inner eth header */ 604 skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); 605 pp = ptype->callbacks.gro_receive(head, skb); 606 607out_unlock: 608 rcu_read_unlock(); 609out: 610 NAPI_GRO_CB(skb)->flush |= flush; 611 612 return pp; 613} 614 615static int vxlan_gro_complete(struct sk_buff *skb, int nhoff) 616{ 617 struct ethhdr *eh; 618 struct packet_offload *ptype; 619 __be16 type; 620 int vxlan_len = sizeof(struct vxlanhdr) + sizeof(struct ethhdr); 621 int err = -ENOSYS; 622 623 eh = (struct ethhdr *)(skb->data + nhoff + sizeof(struct vxlanhdr)); 624 type = eh->h_proto; 625 626 rcu_read_lock(); 627 ptype = gro_find_complete_by_type(type); 628 if (ptype != NULL) 629 err = ptype->callbacks.gro_complete(skb, nhoff + vxlan_len); 630 631 rcu_read_unlock(); 632 return err; 633} 634 635/* Notify netdevs that UDP port started listening */ 636static void vxlan_notify_add_rx_port(struct vxlan_sock *vs) 637{ 638 struct net_device *dev; 639 struct sock *sk = vs->sock->sk; 640 struct net *net = sock_net(sk); 641 sa_family_t sa_family = sk->sk_family; 642 __be16 port = inet_sk(sk)->inet_sport; 643 int err; 644 645 if (sa_family == AF_INET) { 646 err = udp_add_offload(&vs->udp_offloads); 647 if (err) 648 pr_warn("vxlan: udp_add_offload failed with status %d\n", err); 649 } 650 651 rcu_read_lock(); 652 for_each_netdev_rcu(net, dev) { 653 if (dev->netdev_ops->ndo_add_vxlan_port) 654 dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, 655 port); 656 } 657 rcu_read_unlock(); 658} 659 660/* Notify netdevs that UDP port is no more listening */ 661static void vxlan_notify_del_rx_port(struct vxlan_sock *vs) 662{ 663 struct net_device *dev; 664 struct sock *sk = vs->sock->sk; 665 struct net *net = sock_net(sk); 666 sa_family_t sa_family = sk->sk_family; 667 __be16 port = inet_sk(sk)->inet_sport; 668 669 rcu_read_lock(); 670 for_each_netdev_rcu(net, dev) { 671 if (dev->netdev_ops->ndo_del_vxlan_port) 672 dev->netdev_ops->ndo_del_vxlan_port(dev, sa_family, 673 port); 674 } 675 rcu_read_unlock(); 676 677 if (sa_family == AF_INET) 678 udp_del_offload(&vs->udp_offloads); 679} 680 681/* Add new entry to forwarding table -- assumes lock held */ 682static int vxlan_fdb_create(struct vxlan_dev *vxlan, 683 const u8 *mac, union vxlan_addr *ip, 684 __u16 state, __u16 flags, 685 __be16 port, __u32 vni, __u32 ifindex, 686 __u8 ndm_flags) 687{ 688 struct vxlan_rdst *rd = NULL; 689 struct vxlan_fdb *f; 690 int notify = 0; 691 692 f = __vxlan_find_mac(vxlan, mac); 693 if (f) { 694 if (flags & NLM_F_EXCL) { 695 netdev_dbg(vxlan->dev, 696 "lost race to create %pM\n", mac); 697 return -EEXIST; 698 } 699 if (f->state != state) { 700 f->state = state; 701 f->updated = jiffies; 702 notify = 1; 703 } 704 if (f->flags != ndm_flags) { 705 f->flags = ndm_flags; 706 f->updated = jiffies; 707 notify = 1; 708 } 709 if ((flags & NLM_F_REPLACE)) { 710 /* Only change unicasts */ 711 if (!(is_multicast_ether_addr(f->eth_addr) || 712 is_zero_ether_addr(f->eth_addr))) { 713 int rc = vxlan_fdb_replace(f, ip, port, vni, 714 ifindex); 715 716 if (rc < 0) 717 return rc; 718 notify |= rc; 719 } else 720 return -EOPNOTSUPP; 721 } 722 if ((flags & NLM_F_APPEND) && 723 (is_multicast_ether_addr(f->eth_addr) || 724 is_zero_ether_addr(f->eth_addr))) { 725 int rc = vxlan_fdb_append(f, ip, port, vni, ifindex, 726 &rd); 727 728 if (rc < 0) 729 return rc; 730 notify |= rc; 731 } 732 } else { 733 if (!(flags & NLM_F_CREATE)) 734 return -ENOENT; 735 736 if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax) 737 return -ENOSPC; 738 739 /* Disallow replace to add a multicast entry */ 740 if ((flags & NLM_F_REPLACE) && 741 (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) 742 return -EOPNOTSUPP; 743 744 netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); 745 f = kmalloc(sizeof(*f), GFP_ATOMIC); 746 if (!f) 747 return -ENOMEM; 748 749 notify = 1; 750 f->state = state; 751 f->flags = ndm_flags; 752 f->updated = f->used = jiffies; 753 INIT_LIST_HEAD(&f->remotes); 754 memcpy(f->eth_addr, mac, ETH_ALEN); 755 756 vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); 757 758 ++vxlan->addrcnt; 759 hlist_add_head_rcu(&f->hlist, 760 vxlan_fdb_head(vxlan, mac)); 761 } 762 763 if (notify) { 764 if (rd == NULL) 765 rd = first_remote_rtnl(f); 766 vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH); 767 } 768 769 return 0; 770} 771 772static void vxlan_fdb_free(struct rcu_head *head) 773{ 774 struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); 775 struct vxlan_rdst *rd, *nd; 776 777 list_for_each_entry_safe(rd, nd, &f->remotes, list) 778 kfree(rd); 779 kfree(f); 780} 781 782static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) 783{ 784 netdev_dbg(vxlan->dev, 785 "delete %pM\n", f->eth_addr); 786 787 --vxlan->addrcnt; 788 vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); 789 790 hlist_del_rcu(&f->hlist); 791 call_rcu(&f->rcu, vxlan_fdb_free); 792} 793 794static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, 795 union vxlan_addr *ip, __be16 *port, u32 *vni, u32 *ifindex) 796{ 797 struct net *net = dev_net(vxlan->dev); 798 int err; 799 800 if (tb[NDA_DST]) { 801 err = vxlan_nla_get_addr(ip, tb[NDA_DST]); 802 if (err) 803 return err; 804 } else { 805 union vxlan_addr *remote = &vxlan->default_dst.remote_ip; 806 if (remote->sa.sa_family == AF_INET) { 807 ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); 808 ip->sa.sa_family = AF_INET; 809#if IS_ENABLED(CONFIG_IPV6) 810 } else { 811 ip->sin6.sin6_addr = in6addr_any; 812 ip->sa.sa_family = AF_INET6; 813#endif 814 } 815 } 816 817 if (tb[NDA_PORT]) { 818 if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) 819 return -EINVAL; 820 *port = nla_get_be16(tb[NDA_PORT]); 821 } else { 822 *port = vxlan->dst_port; 823 } 824 825 if (tb[NDA_VNI]) { 826 if (nla_len(tb[NDA_VNI]) != sizeof(u32)) 827 return -EINVAL; 828 *vni = nla_get_u32(tb[NDA_VNI]); 829 } else { 830 *vni = vxlan->default_dst.remote_vni; 831 } 832 833 if (tb[NDA_IFINDEX]) { 834 struct net_device *tdev; 835 836 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) 837 return -EINVAL; 838 *ifindex = nla_get_u32(tb[NDA_IFINDEX]); 839 tdev = __dev_get_by_index(net, *ifindex); 840 if (!tdev) 841 return -EADDRNOTAVAIL; 842 } else { 843 *ifindex = 0; 844 } 845 846 return 0; 847} 848 849/* Add static entry (via netlink) */ 850static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], 851 struct net_device *dev, 852 const unsigned char *addr, u16 flags) 853{ 854 struct vxlan_dev *vxlan = netdev_priv(dev); 855 /* struct net *net = dev_net(vxlan->dev); */ 856 union vxlan_addr ip; 857 __be16 port; 858 u32 vni, ifindex; 859 int err; 860 861 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { 862 pr_info("RTM_NEWNEIGH with invalid state %#x\n", 863 ndm->ndm_state); 864 return -EINVAL; 865 } 866 867 if (tb[NDA_DST] == NULL) 868 return -EINVAL; 869 870 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); 871 if (err) 872 return err; 873 874 if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) 875 return -EAFNOSUPPORT; 876 877 spin_lock_bh(&vxlan->hash_lock); 878 err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, 879 port, vni, ifindex, ndm->ndm_flags); 880 spin_unlock_bh(&vxlan->hash_lock); 881 882 return err; 883} 884 885/* Delete entry (via netlink) */ 886static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], 887 struct net_device *dev, 888 const unsigned char *addr) 889{ 890 struct vxlan_dev *vxlan = netdev_priv(dev); 891 struct vxlan_fdb *f; 892 struct vxlan_rdst *rd = NULL; 893 union vxlan_addr ip; 894 __be16 port; 895 u32 vni, ifindex; 896 int err; 897 898 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &vni, &ifindex); 899 if (err) 900 return err; 901 902 err = -ENOENT; 903 904 spin_lock_bh(&vxlan->hash_lock); 905 f = vxlan_find_mac(vxlan, addr); 906 if (!f) 907 goto out; 908 909 if (!vxlan_addr_any(&ip)) { 910 rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); 911 if (!rd) 912 goto out; 913 } 914 915 err = 0; 916 917 /* remove a destination if it's not the only one on the list, 918 * otherwise destroy the fdb entry 919 */ 920 if (rd && !list_is_singular(&f->remotes)) { 921 list_del_rcu(&rd->list); 922 vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH); 923 kfree_rcu(rd, rcu); 924 goto out; 925 } 926 927 vxlan_fdb_destroy(vxlan, f); 928 929out: 930 spin_unlock_bh(&vxlan->hash_lock); 931 932 return err; 933} 934 935/* Dump forwarding table */ 936static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 937 struct net_device *dev, 938 struct net_device *filter_dev, int idx) 939{ 940 struct vxlan_dev *vxlan = netdev_priv(dev); 941 unsigned int h; 942 943 for (h = 0; h < FDB_HASH_SIZE; ++h) { 944 struct vxlan_fdb *f; 945 int err; 946 947 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { 948 struct vxlan_rdst *rd; 949 950 if (idx < cb->args[0]) 951 goto skip; 952 953 list_for_each_entry_rcu(rd, &f->remotes, list) { 954 err = vxlan_fdb_info(skb, vxlan, f, 955 NETLINK_CB(cb->skb).portid, 956 cb->nlh->nlmsg_seq, 957 RTM_NEWNEIGH, 958 NLM_F_MULTI, rd); 959 if (err < 0) 960 goto out; 961 } 962skip: 963 ++idx; 964 } 965 } 966out: 967 return idx; 968} 969 970/* Watch incoming packets to learn mapping between Ethernet address 971 * and Tunnel endpoint. 972 * Return true if packet is bogus and should be droppped. 973 */ 974static bool vxlan_snoop(struct net_device *dev, 975 union vxlan_addr *src_ip, const u8 *src_mac) 976{ 977 struct vxlan_dev *vxlan = netdev_priv(dev); 978 struct vxlan_fdb *f; 979 980 f = vxlan_find_mac(vxlan, src_mac); 981 if (likely(f)) { 982 struct vxlan_rdst *rdst = first_remote_rcu(f); 983 984 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip))) 985 return false; 986 987 /* Don't migrate static entries, drop packets */ 988 if (f->state & NUD_NOARP) 989 return true; 990 991 if (net_ratelimit()) 992 netdev_info(dev, 993 "%pM migrated from %pIS to %pIS\n", 994 src_mac, &rdst->remote_ip, &src_ip); 995 996 rdst->remote_ip = *src_ip; 997 f->updated = jiffies; 998 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH); 999 } else { 1000 /* learned new entry */ 1001 spin_lock(&vxlan->hash_lock); 1002 1003 /* close off race between vxlan_flush and incoming packets */ 1004 if (netif_running(dev)) 1005 vxlan_fdb_create(vxlan, src_mac, src_ip, 1006 NUD_REACHABLE, 1007 NLM_F_EXCL|NLM_F_CREATE, 1008 vxlan->dst_port, 1009 vxlan->default_dst.remote_vni, 1010 0, NTF_SELF); 1011 spin_unlock(&vxlan->hash_lock); 1012 } 1013 1014 return false; 1015} 1016 1017/* See if multicast group is already in use by other ID */ 1018static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) 1019{ 1020 struct vxlan_dev *vxlan; 1021 1022 /* The vxlan_sock is only used by dev, leaving group has 1023 * no effect on other vxlan devices. 1024 */ 1025 if (atomic_read(&dev->vn_sock->refcnt) == 1) 1026 return false; 1027 1028 list_for_each_entry(vxlan, &vn->vxlan_list, next) { 1029 if (!netif_running(vxlan->dev) || vxlan == dev) 1030 continue; 1031 1032 if (vxlan->vn_sock != dev->vn_sock) 1033 continue; 1034 1035 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, 1036 &dev->default_dst.remote_ip)) 1037 continue; 1038 1039 if (vxlan->default_dst.remote_ifindex != 1040 dev->default_dst.remote_ifindex) 1041 continue; 1042 1043 return true; 1044 } 1045 1046 return false; 1047} 1048 1049static void vxlan_sock_hold(struct vxlan_sock *vs) 1050{ 1051 atomic_inc(&vs->refcnt); 1052} 1053 1054void vxlan_sock_release(struct vxlan_sock *vs) 1055{ 1056 struct sock *sk = vs->sock->sk; 1057 struct net *net = sock_net(sk); 1058 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 1059 1060 if (!atomic_dec_and_test(&vs->refcnt)) 1061 return; 1062 1063 spin_lock(&vn->sock_lock); 1064 hlist_del_rcu(&vs->hlist); 1065 rcu_assign_sk_user_data(vs->sock->sk, NULL); 1066 vxlan_notify_del_rx_port(vs); 1067 spin_unlock(&vn->sock_lock); 1068 1069 queue_work(vxlan_wq, &vs->del_work); 1070} 1071EXPORT_SYMBOL_GPL(vxlan_sock_release); 1072 1073/* Callback to update multicast group membership when first VNI on 1074 * multicast asddress is brought up 1075 * Done as workqueue because ip_mc_join_group acquires RTNL. 1076 */ 1077static void vxlan_igmp_join(struct work_struct *work) 1078{ 1079 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join); 1080 struct vxlan_sock *vs = vxlan->vn_sock; 1081 struct sock *sk = vs->sock->sk; 1082 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1083 int ifindex = vxlan->default_dst.remote_ifindex; 1084 1085 lock_sock(sk); 1086 if (ip->sa.sa_family == AF_INET) { 1087 struct ip_mreqn mreq = { 1088 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1089 .imr_ifindex = ifindex, 1090 }; 1091 1092 ip_mc_join_group(sk, &mreq); 1093#if IS_ENABLED(CONFIG_IPV6) 1094 } else { 1095 ipv6_stub->ipv6_sock_mc_join(sk, ifindex, 1096 &ip->sin6.sin6_addr); 1097#endif 1098 } 1099 release_sock(sk); 1100 1101 vxlan_sock_release(vs); 1102 dev_put(vxlan->dev); 1103} 1104 1105/* Inverse of vxlan_igmp_join when last VNI is brought down */ 1106static void vxlan_igmp_leave(struct work_struct *work) 1107{ 1108 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave); 1109 struct vxlan_sock *vs = vxlan->vn_sock; 1110 struct sock *sk = vs->sock->sk; 1111 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1112 int ifindex = vxlan->default_dst.remote_ifindex; 1113 1114 lock_sock(sk); 1115 if (ip->sa.sa_family == AF_INET) { 1116 struct ip_mreqn mreq = { 1117 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1118 .imr_ifindex = ifindex, 1119 }; 1120 1121 ip_mc_leave_group(sk, &mreq); 1122#if IS_ENABLED(CONFIG_IPV6) 1123 } else { 1124 ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, 1125 &ip->sin6.sin6_addr); 1126#endif 1127 } 1128 1129 release_sock(sk); 1130 1131 vxlan_sock_release(vs); 1132 dev_put(vxlan->dev); 1133} 1134 1135/* Callback from net/ipv4/udp.c to receive packets */ 1136static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) 1137{ 1138 struct vxlan_sock *vs; 1139 struct vxlanhdr *vxh; 1140 1141 /* Need Vxlan and inner Ethernet header to be present */ 1142 if (!pskb_may_pull(skb, VXLAN_HLEN)) 1143 goto error; 1144 1145 /* Return packets with reserved bits set */ 1146 vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); 1147 if (vxh->vx_flags != htonl(VXLAN_FLAGS) || 1148 (vxh->vx_vni & htonl(0xff))) { 1149 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", 1150 ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); 1151 goto error; 1152 } 1153 1154 if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) 1155 goto drop; 1156 1157 vs = rcu_dereference_sk_user_data(sk); 1158 if (!vs) 1159 goto drop; 1160 1161 skb_pop_rcv_encapsulation(skb); 1162 1163 vs->rcv(vs, skb, vxh->vx_vni); 1164 return 0; 1165 1166drop: 1167 /* Consume bad packet */ 1168 kfree_skb(skb); 1169 return 0; 1170 1171error: 1172 /* Return non vxlan pkt */ 1173 return 1; 1174} 1175 1176static void vxlan_rcv(struct vxlan_sock *vs, 1177 struct sk_buff *skb, __be32 vx_vni) 1178{ 1179 struct iphdr *oip = NULL; 1180 struct ipv6hdr *oip6 = NULL; 1181 struct vxlan_dev *vxlan; 1182 struct pcpu_sw_netstats *stats; 1183 union vxlan_addr saddr; 1184 __u32 vni; 1185 int err = 0; 1186 union vxlan_addr *remote_ip; 1187 1188 vni = ntohl(vx_vni) >> 8; 1189 /* Is this VNI defined? */ 1190 vxlan = vxlan_vs_find_vni(vs, vni); 1191 if (!vxlan) 1192 goto drop; 1193 1194 remote_ip = &vxlan->default_dst.remote_ip; 1195 skb_reset_mac_header(skb); 1196 skb_scrub_packet(skb, !net_eq(vxlan->net, dev_net(vxlan->dev))); 1197 skb->protocol = eth_type_trans(skb, vxlan->dev); 1198 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1199 1200 /* Ignore packet loops (and multicast echo) */ 1201 if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) 1202 goto drop; 1203 1204 /* Re-examine inner Ethernet packet */ 1205 if (remote_ip->sa.sa_family == AF_INET) { 1206 oip = ip_hdr(skb); 1207 saddr.sin.sin_addr.s_addr = oip->saddr; 1208 saddr.sa.sa_family = AF_INET; 1209#if IS_ENABLED(CONFIG_IPV6) 1210 } else { 1211 oip6 = ipv6_hdr(skb); 1212 saddr.sin6.sin6_addr = oip6->saddr; 1213 saddr.sa.sa_family = AF_INET6; 1214#endif 1215 } 1216 1217 if ((vxlan->flags & VXLAN_F_LEARN) && 1218 vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source)) 1219 goto drop; 1220 1221 skb_reset_network_header(skb); 1222 1223 if (oip6) 1224 err = IP6_ECN_decapsulate(oip6, skb); 1225 if (oip) 1226 err = IP_ECN_decapsulate(oip, skb); 1227 1228 if (unlikely(err)) { 1229 if (log_ecn_error) { 1230 if (oip6) 1231 net_info_ratelimited("non-ECT from %pI6\n", 1232 &oip6->saddr); 1233 if (oip) 1234 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 1235 &oip->saddr, oip->tos); 1236 } 1237 if (err > 1) { 1238 ++vxlan->dev->stats.rx_frame_errors; 1239 ++vxlan->dev->stats.rx_errors; 1240 goto drop; 1241 } 1242 } 1243 1244 stats = this_cpu_ptr(vxlan->dev->tstats); 1245 u64_stats_update_begin(&stats->syncp); 1246 stats->rx_packets++; 1247 stats->rx_bytes += skb->len; 1248 u64_stats_update_end(&stats->syncp); 1249 1250 netif_rx(skb); 1251 1252 return; 1253drop: 1254 /* Consume bad packet */ 1255 kfree_skb(skb); 1256} 1257 1258static int arp_reduce(struct net_device *dev, struct sk_buff *skb) 1259{ 1260 struct vxlan_dev *vxlan = netdev_priv(dev); 1261 struct arphdr *parp; 1262 u8 *arpptr, *sha; 1263 __be32 sip, tip; 1264 struct neighbour *n; 1265 1266 if (dev->flags & IFF_NOARP) 1267 goto out; 1268 1269 if (!pskb_may_pull(skb, arp_hdr_len(dev))) { 1270 dev->stats.tx_dropped++; 1271 goto out; 1272 } 1273 parp = arp_hdr(skb); 1274 1275 if ((parp->ar_hrd != htons(ARPHRD_ETHER) && 1276 parp->ar_hrd != htons(ARPHRD_IEEE802)) || 1277 parp->ar_pro != htons(ETH_P_IP) || 1278 parp->ar_op != htons(ARPOP_REQUEST) || 1279 parp->ar_hln != dev->addr_len || 1280 parp->ar_pln != 4) 1281 goto out; 1282 arpptr = (u8 *)parp + sizeof(struct arphdr); 1283 sha = arpptr; 1284 arpptr += dev->addr_len; /* sha */ 1285 memcpy(&sip, arpptr, sizeof(sip)); 1286 arpptr += sizeof(sip); 1287 arpptr += dev->addr_len; /* tha */ 1288 memcpy(&tip, arpptr, sizeof(tip)); 1289 1290 if (ipv4_is_loopback(tip) || 1291 ipv4_is_multicast(tip)) 1292 goto out; 1293 1294 n = neigh_lookup(&arp_tbl, &tip, dev); 1295 1296 if (n) { 1297 struct vxlan_fdb *f; 1298 struct sk_buff *reply; 1299 1300 if (!(n->nud_state & NUD_CONNECTED)) { 1301 neigh_release(n); 1302 goto out; 1303 } 1304 1305 f = vxlan_find_mac(vxlan, n->ha); 1306 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1307 /* bridge-local neighbor */ 1308 neigh_release(n); 1309 goto out; 1310 } 1311 1312 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, 1313 n->ha, sha); 1314 1315 neigh_release(n); 1316 1317 if (reply == NULL) 1318 goto out; 1319 1320 skb_reset_mac_header(reply); 1321 __skb_pull(reply, skb_network_offset(reply)); 1322 reply->ip_summed = CHECKSUM_UNNECESSARY; 1323 reply->pkt_type = PACKET_HOST; 1324 1325 if (netif_rx_ni(reply) == NET_RX_DROP) 1326 dev->stats.rx_dropped++; 1327 } else if (vxlan->flags & VXLAN_F_L3MISS) { 1328 union vxlan_addr ipa = { 1329 .sin.sin_addr.s_addr = tip, 1330 .sa.sa_family = AF_INET, 1331 }; 1332 1333 vxlan_ip_miss(dev, &ipa); 1334 } 1335out: 1336 consume_skb(skb); 1337 return NETDEV_TX_OK; 1338} 1339 1340#if IS_ENABLED(CONFIG_IPV6) 1341 1342static struct sk_buff *vxlan_na_create(struct sk_buff *request, 1343 struct neighbour *n, bool isrouter) 1344{ 1345 struct net_device *dev = request->dev; 1346 struct sk_buff *reply; 1347 struct nd_msg *ns, *na; 1348 struct ipv6hdr *pip6; 1349 u8 *daddr; 1350 int na_olen = 8; /* opt hdr + ETH_ALEN for target */ 1351 int ns_olen; 1352 int i, len; 1353 1354 if (dev == NULL) 1355 return NULL; 1356 1357 len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + 1358 sizeof(*na) + na_olen + dev->needed_tailroom; 1359 reply = alloc_skb(len, GFP_ATOMIC); 1360 if (reply == NULL) 1361 return NULL; 1362 1363 reply->protocol = htons(ETH_P_IPV6); 1364 reply->dev = dev; 1365 skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); 1366 skb_push(reply, sizeof(struct ethhdr)); 1367 skb_set_mac_header(reply, 0); 1368 1369 ns = (struct nd_msg *)skb_transport_header(request); 1370 1371 daddr = eth_hdr(request)->h_source; 1372 ns_olen = request->len - skb_transport_offset(request) - sizeof(*ns); 1373 for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { 1374 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { 1375 daddr = ns->opt + i + sizeof(struct nd_opt_hdr); 1376 break; 1377 } 1378 } 1379 1380 /* Ethernet header */ 1381 ether_addr_copy(eth_hdr(reply)->h_dest, daddr); 1382 ether_addr_copy(eth_hdr(reply)->h_source, n->ha); 1383 eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); 1384 reply->protocol = htons(ETH_P_IPV6); 1385 1386 skb_pull(reply, sizeof(struct ethhdr)); 1387 skb_set_network_header(reply, 0); 1388 skb_put(reply, sizeof(struct ipv6hdr)); 1389 1390 /* IPv6 header */ 1391 1392 pip6 = ipv6_hdr(reply); 1393 memset(pip6, 0, sizeof(struct ipv6hdr)); 1394 pip6->version = 6; 1395 pip6->priority = ipv6_hdr(request)->priority; 1396 pip6->nexthdr = IPPROTO_ICMPV6; 1397 pip6->hop_limit = 255; 1398 pip6->daddr = ipv6_hdr(request)->saddr; 1399 pip6->saddr = *(struct in6_addr *)n->primary_key; 1400 1401 skb_pull(reply, sizeof(struct ipv6hdr)); 1402 skb_set_transport_header(reply, 0); 1403 1404 na = (struct nd_msg *)skb_put(reply, sizeof(*na) + na_olen); 1405 1406 /* Neighbor Advertisement */ 1407 memset(na, 0, sizeof(*na)+na_olen); 1408 na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; 1409 na->icmph.icmp6_router = isrouter; 1410 na->icmph.icmp6_override = 1; 1411 na->icmph.icmp6_solicited = 1; 1412 na->target = ns->target; 1413 ether_addr_copy(&na->opt[2], n->ha); 1414 na->opt[0] = ND_OPT_TARGET_LL_ADDR; 1415 na->opt[1] = na_olen >> 3; 1416 1417 na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, 1418 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, 1419 csum_partial(na, sizeof(*na)+na_olen, 0)); 1420 1421 pip6->payload_len = htons(sizeof(*na)+na_olen); 1422 1423 skb_push(reply, sizeof(struct ipv6hdr)); 1424 1425 reply->ip_summed = CHECKSUM_UNNECESSARY; 1426 1427 return reply; 1428} 1429 1430static int neigh_reduce(struct net_device *dev, struct sk_buff *skb) 1431{ 1432 struct vxlan_dev *vxlan = netdev_priv(dev); 1433 struct nd_msg *msg; 1434 const struct ipv6hdr *iphdr; 1435 const struct in6_addr *saddr, *daddr; 1436 struct neighbour *n; 1437 struct inet6_dev *in6_dev; 1438 1439 in6_dev = __in6_dev_get(dev); 1440 if (!in6_dev) 1441 goto out; 1442 1443 if (!pskb_may_pull(skb, skb->len)) 1444 goto out; 1445 1446 iphdr = ipv6_hdr(skb); 1447 saddr = &iphdr->saddr; 1448 daddr = &iphdr->daddr; 1449 1450 msg = (struct nd_msg *)skb_transport_header(skb); 1451 if (msg->icmph.icmp6_code != 0 || 1452 msg->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION) 1453 goto out; 1454 1455 if (ipv6_addr_loopback(daddr) || 1456 ipv6_addr_is_multicast(&msg->target)) 1457 goto out; 1458 1459 n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); 1460 1461 if (n) { 1462 struct vxlan_fdb *f; 1463 struct sk_buff *reply; 1464 1465 if (!(n->nud_state & NUD_CONNECTED)) { 1466 neigh_release(n); 1467 goto out; 1468 } 1469 1470 f = vxlan_find_mac(vxlan, n->ha); 1471 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1472 /* bridge-local neighbor */ 1473 neigh_release(n); 1474 goto out; 1475 } 1476 1477 reply = vxlan_na_create(skb, n, 1478 !!(f ? f->flags & NTF_ROUTER : 0)); 1479 1480 neigh_release(n); 1481 1482 if (reply == NULL) 1483 goto out; 1484 1485 if (netif_rx_ni(reply) == NET_RX_DROP) 1486 dev->stats.rx_dropped++; 1487 1488 } else if (vxlan->flags & VXLAN_F_L3MISS) { 1489 union vxlan_addr ipa = { 1490 .sin6.sin6_addr = msg->target, 1491 .sa.sa_family = AF_INET6, 1492 }; 1493 1494 vxlan_ip_miss(dev, &ipa); 1495 } 1496 1497out: 1498 consume_skb(skb); 1499 return NETDEV_TX_OK; 1500} 1501#endif 1502 1503static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) 1504{ 1505 struct vxlan_dev *vxlan = netdev_priv(dev); 1506 struct neighbour *n; 1507 1508 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) 1509 return false; 1510 1511 n = NULL; 1512 switch (ntohs(eth_hdr(skb)->h_proto)) { 1513 case ETH_P_IP: 1514 { 1515 struct iphdr *pip; 1516 1517 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 1518 return false; 1519 pip = ip_hdr(skb); 1520 n = neigh_lookup(&arp_tbl, &pip->daddr, dev); 1521 if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { 1522 union vxlan_addr ipa = { 1523 .sin.sin_addr.s_addr = pip->daddr, 1524 .sa.sa_family = AF_INET, 1525 }; 1526 1527 vxlan_ip_miss(dev, &ipa); 1528 return false; 1529 } 1530 1531 break; 1532 } 1533#if IS_ENABLED(CONFIG_IPV6) 1534 case ETH_P_IPV6: 1535 { 1536 struct ipv6hdr *pip6; 1537 1538 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 1539 return false; 1540 pip6 = ipv6_hdr(skb); 1541 n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); 1542 if (!n && (vxlan->flags & VXLAN_F_L3MISS)) { 1543 union vxlan_addr ipa = { 1544 .sin6.sin6_addr = pip6->daddr, 1545 .sa.sa_family = AF_INET6, 1546 }; 1547 1548 vxlan_ip_miss(dev, &ipa); 1549 return false; 1550 } 1551 1552 break; 1553 } 1554#endif 1555 default: 1556 return false; 1557 } 1558 1559 if (n) { 1560 bool diff; 1561 1562 diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); 1563 if (diff) { 1564 memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 1565 dev->addr_len); 1566 memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); 1567 } 1568 neigh_release(n); 1569 return diff; 1570 } 1571 1572 return false; 1573} 1574 1575static inline struct sk_buff *vxlan_handle_offloads(struct sk_buff *skb, 1576 bool udp_csum) 1577{ 1578 int type = udp_csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; 1579 return iptunnel_handle_offloads(skb, udp_csum, type); 1580} 1581 1582#if IS_ENABLED(CONFIG_IPV6) 1583static int vxlan6_xmit_skb(struct vxlan_sock *vs, 1584 struct dst_entry *dst, struct sk_buff *skb, 1585 struct net_device *dev, struct in6_addr *saddr, 1586 struct in6_addr *daddr, __u8 prio, __u8 ttl, 1587 __be16 src_port, __be16 dst_port, __be32 vni, 1588 bool xnet) 1589{ 1590 struct ipv6hdr *ip6h; 1591 struct vxlanhdr *vxh; 1592 struct udphdr *uh; 1593 int min_headroom; 1594 int err; 1595 1596 skb = vxlan_handle_offloads(skb, !udp_get_no_check6_tx(vs->sock->sk)); 1597 if (IS_ERR(skb)) 1598 return -EINVAL; 1599 1600 skb_scrub_packet(skb, xnet); 1601 1602 min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len 1603 + VXLAN_HLEN + sizeof(struct ipv6hdr) 1604 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); 1605 1606 /* Need space for new headers (invalidates iph ptr) */ 1607 err = skb_cow_head(skb, min_headroom); 1608 if (unlikely(err)) 1609 return err; 1610 1611 if (vlan_tx_tag_present(skb)) { 1612 if (WARN_ON(!__vlan_put_tag(skb, 1613 skb->vlan_proto, 1614 vlan_tx_tag_get(skb)))) 1615 return -ENOMEM; 1616 1617 skb->vlan_tci = 0; 1618 } 1619 1620 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 1621 vxh->vx_flags = htonl(VXLAN_FLAGS); 1622 vxh->vx_vni = vni; 1623 1624 __skb_push(skb, sizeof(*uh)); 1625 skb_reset_transport_header(skb); 1626 uh = udp_hdr(skb); 1627 1628 uh->dest = dst_port; 1629 uh->source = src_port; 1630 1631 uh->len = htons(skb->len); 1632 1633 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); 1634 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | 1635 IPSKB_REROUTED); 1636 skb_dst_set(skb, dst); 1637 1638 udp6_set_csum(udp_get_no_check6_tx(vs->sock->sk), skb, 1639 saddr, daddr, skb->len); 1640 1641 __skb_push(skb, sizeof(*ip6h)); 1642 skb_reset_network_header(skb); 1643 ip6h = ipv6_hdr(skb); 1644 ip6h->version = 6; 1645 ip6h->priority = prio; 1646 ip6h->flow_lbl[0] = 0; 1647 ip6h->flow_lbl[1] = 0; 1648 ip6h->flow_lbl[2] = 0; 1649 ip6h->payload_len = htons(skb->len); 1650 ip6h->nexthdr = IPPROTO_UDP; 1651 ip6h->hop_limit = ttl; 1652 ip6h->daddr = *daddr; 1653 ip6h->saddr = *saddr; 1654 1655 ip6tunnel_xmit(skb, dev); 1656 return 0; 1657} 1658#endif 1659 1660int vxlan_xmit_skb(struct vxlan_sock *vs, 1661 struct rtable *rt, struct sk_buff *skb, 1662 __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, 1663 __be16 src_port, __be16 dst_port, __be32 vni, bool xnet) 1664{ 1665 struct vxlanhdr *vxh; 1666 struct udphdr *uh; 1667 int min_headroom; 1668 int err; 1669 1670 skb = vxlan_handle_offloads(skb, !vs->sock->sk->sk_no_check_tx); 1671 if (IS_ERR(skb)) 1672 return -EINVAL; 1673 1674 min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len 1675 + VXLAN_HLEN + sizeof(struct iphdr) 1676 + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); 1677 1678 /* Need space for new headers (invalidates iph ptr) */ 1679 err = skb_cow_head(skb, min_headroom); 1680 if (unlikely(err)) 1681 return err; 1682 1683 if (vlan_tx_tag_present(skb)) { 1684 if (WARN_ON(!__vlan_put_tag(skb, 1685 skb->vlan_proto, 1686 vlan_tx_tag_get(skb)))) 1687 return -ENOMEM; 1688 1689 skb->vlan_tci = 0; 1690 } 1691 1692 vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); 1693 vxh->vx_flags = htonl(VXLAN_FLAGS); 1694 vxh->vx_vni = vni; 1695 1696 __skb_push(skb, sizeof(*uh)); 1697 skb_reset_transport_header(skb); 1698 uh = udp_hdr(skb); 1699 1700 uh->dest = dst_port; 1701 uh->source = src_port; 1702 1703 uh->len = htons(skb->len); 1704 1705 udp_set_csum(vs->sock->sk->sk_no_check_tx, skb, 1706 src, dst, skb->len); 1707 1708 return iptunnel_xmit(vs->sock->sk, rt, skb, src, dst, IPPROTO_UDP, 1709 tos, ttl, df, xnet); 1710} 1711EXPORT_SYMBOL_GPL(vxlan_xmit_skb); 1712 1713/* Bypass encapsulation if the destination is local */ 1714static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, 1715 struct vxlan_dev *dst_vxlan) 1716{ 1717 struct pcpu_sw_netstats *tx_stats, *rx_stats; 1718 union vxlan_addr loopback; 1719 union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; 1720 1721 tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); 1722 rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); 1723 skb->pkt_type = PACKET_HOST; 1724 skb->encapsulation = 0; 1725 skb->dev = dst_vxlan->dev; 1726 __skb_pull(skb, skb_network_offset(skb)); 1727 1728 if (remote_ip->sa.sa_family == AF_INET) { 1729 loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 1730 loopback.sa.sa_family = AF_INET; 1731#if IS_ENABLED(CONFIG_IPV6) 1732 } else { 1733 loopback.sin6.sin6_addr = in6addr_loopback; 1734 loopback.sa.sa_family = AF_INET6; 1735#endif 1736 } 1737 1738 if (dst_vxlan->flags & VXLAN_F_LEARN) 1739 vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source); 1740 1741 u64_stats_update_begin(&tx_stats->syncp); 1742 tx_stats->tx_packets++; 1743 tx_stats->tx_bytes += skb->len; 1744 u64_stats_update_end(&tx_stats->syncp); 1745 1746 if (netif_rx(skb) == NET_RX_SUCCESS) { 1747 u64_stats_update_begin(&rx_stats->syncp); 1748 rx_stats->rx_packets++; 1749 rx_stats->rx_bytes += skb->len; 1750 u64_stats_update_end(&rx_stats->syncp); 1751 } else { 1752 skb->dev->stats.rx_dropped++; 1753 } 1754} 1755 1756static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, 1757 struct vxlan_rdst *rdst, bool did_rsc) 1758{ 1759 struct vxlan_dev *vxlan = netdev_priv(dev); 1760 struct rtable *rt = NULL; 1761 const struct iphdr *old_iph; 1762 struct flowi4 fl4; 1763 union vxlan_addr *dst; 1764 __be16 src_port = 0, dst_port; 1765 u32 vni; 1766 __be16 df = 0; 1767 __u8 tos, ttl; 1768 int err; 1769 1770 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port; 1771 vni = rdst->remote_vni; 1772 dst = &rdst->remote_ip; 1773 1774 if (vxlan_addr_any(dst)) { 1775 if (did_rsc) { 1776 /* short-circuited back to local bridge */ 1777 vxlan_encap_bypass(skb, vxlan, vxlan); 1778 return; 1779 } 1780 goto drop; 1781 } 1782 1783 old_iph = ip_hdr(skb); 1784 1785 ttl = vxlan->ttl; 1786 if (!ttl && vxlan_addr_multicast(dst)) 1787 ttl = 1; 1788 1789 tos = vxlan->tos; 1790 if (tos == 1) 1791 tos = ip_tunnel_get_dsfield(old_iph, skb); 1792 1793 src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->port_min, 1794 vxlan->port_max, true); 1795 1796 if (dst->sa.sa_family == AF_INET) { 1797 memset(&fl4, 0, sizeof(fl4)); 1798 fl4.flowi4_oif = rdst->remote_ifindex; 1799 fl4.flowi4_tos = RT_TOS(tos); 1800 fl4.daddr = dst->sin.sin_addr.s_addr; 1801 fl4.saddr = vxlan->saddr.sin.sin_addr.s_addr; 1802 1803 rt = ip_route_output_key(vxlan->net, &fl4); 1804 if (IS_ERR(rt)) { 1805 netdev_dbg(dev, "no route to %pI4\n", 1806 &dst->sin.sin_addr.s_addr); 1807 dev->stats.tx_carrier_errors++; 1808 goto tx_error; 1809 } 1810 1811 if (rt->dst.dev == dev) { 1812 netdev_dbg(dev, "circular route to %pI4\n", 1813 &dst->sin.sin_addr.s_addr); 1814 dev->stats.collisions++; 1815 goto rt_tx_error; 1816 } 1817 1818 /* Bypass encapsulation if the destination is local */ 1819 if (rt->rt_flags & RTCF_LOCAL && 1820 !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { 1821 struct vxlan_dev *dst_vxlan; 1822 1823 ip_rt_put(rt); 1824 dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst_port); 1825 if (!dst_vxlan) 1826 goto tx_error; 1827 vxlan_encap_bypass(skb, vxlan, dst_vxlan); 1828 return; 1829 } 1830 1831 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 1832 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 1833 1834 err = vxlan_xmit_skb(vxlan->vn_sock, rt, skb, 1835 fl4.saddr, dst->sin.sin_addr.s_addr, 1836 tos, ttl, df, src_port, dst_port, 1837 htonl(vni << 8), 1838 !net_eq(vxlan->net, dev_net(vxlan->dev))); 1839 1840 if (err < 0) 1841 goto rt_tx_error; 1842 iptunnel_xmit_stats(err, &dev->stats, dev->tstats); 1843#if IS_ENABLED(CONFIG_IPV6) 1844 } else { 1845 struct sock *sk = vxlan->vn_sock->sock->sk; 1846 struct dst_entry *ndst; 1847 struct flowi6 fl6; 1848 u32 flags; 1849 1850 memset(&fl6, 0, sizeof(fl6)); 1851 fl6.flowi6_oif = rdst->remote_ifindex; 1852 fl6.daddr = dst->sin6.sin6_addr; 1853 fl6.saddr = vxlan->saddr.sin6.sin6_addr; 1854 fl6.flowi6_proto = IPPROTO_UDP; 1855 1856 if (ipv6_stub->ipv6_dst_lookup(sk, &ndst, &fl6)) { 1857 netdev_dbg(dev, "no route to %pI6\n", 1858 &dst->sin6.sin6_addr); 1859 dev->stats.tx_carrier_errors++; 1860 goto tx_error; 1861 } 1862 1863 if (ndst->dev == dev) { 1864 netdev_dbg(dev, "circular route to %pI6\n", 1865 &dst->sin6.sin6_addr); 1866 dst_release(ndst); 1867 dev->stats.collisions++; 1868 goto tx_error; 1869 } 1870 1871 /* Bypass encapsulation if the destination is local */ 1872 flags = ((struct rt6_info *)ndst)->rt6i_flags; 1873 if (flags & RTF_LOCAL && 1874 !(flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { 1875 struct vxlan_dev *dst_vxlan; 1876 1877 dst_release(ndst); 1878 dst_vxlan = vxlan_find_vni(vxlan->net, vni, dst_port); 1879 if (!dst_vxlan) 1880 goto tx_error; 1881 vxlan_encap_bypass(skb, vxlan, dst_vxlan); 1882 return; 1883 } 1884 1885 ttl = ttl ? : ip6_dst_hoplimit(ndst); 1886 1887 err = vxlan6_xmit_skb(vxlan->vn_sock, ndst, skb, 1888 dev, &fl6.saddr, &fl6.daddr, 0, ttl, 1889 src_port, dst_port, htonl(vni << 8), 1890 !net_eq(vxlan->net, dev_net(vxlan->dev))); 1891#endif 1892 } 1893 1894 return; 1895 1896drop: 1897 dev->stats.tx_dropped++; 1898 goto tx_free; 1899 1900rt_tx_error: 1901 ip_rt_put(rt); 1902tx_error: 1903 dev->stats.tx_errors++; 1904tx_free: 1905 dev_kfree_skb(skb); 1906} 1907 1908/* Transmit local packets over Vxlan 1909 * 1910 * Outer IP header inherits ECN and DF from inner header. 1911 * Outer UDP destination is the VXLAN assigned port. 1912 * source port is based on hash of flow 1913 */ 1914static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 1915{ 1916 struct vxlan_dev *vxlan = netdev_priv(dev); 1917 struct ethhdr *eth; 1918 bool did_rsc = false; 1919 struct vxlan_rdst *rdst, *fdst = NULL; 1920 struct vxlan_fdb *f; 1921 1922 skb_reset_mac_header(skb); 1923 eth = eth_hdr(skb); 1924 1925 if ((vxlan->flags & VXLAN_F_PROXY)) { 1926 if (ntohs(eth->h_proto) == ETH_P_ARP) 1927 return arp_reduce(dev, skb); 1928#if IS_ENABLED(CONFIG_IPV6) 1929 else if (ntohs(eth->h_proto) == ETH_P_IPV6 && 1930 skb->len >= sizeof(struct ipv6hdr) + sizeof(struct nd_msg) && 1931 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 1932 struct nd_msg *msg; 1933 1934 msg = (struct nd_msg *)skb_transport_header(skb); 1935 if (msg->icmph.icmp6_code == 0 && 1936 msg->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) 1937 return neigh_reduce(dev, skb); 1938 } 1939#endif 1940 } 1941 1942 f = vxlan_find_mac(vxlan, eth->h_dest); 1943 did_rsc = false; 1944 1945 if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) && 1946 (ntohs(eth->h_proto) == ETH_P_IP || 1947 ntohs(eth->h_proto) == ETH_P_IPV6)) { 1948 did_rsc = route_shortcircuit(dev, skb); 1949 if (did_rsc) 1950 f = vxlan_find_mac(vxlan, eth->h_dest); 1951 } 1952 1953 if (f == NULL) { 1954 f = vxlan_find_mac(vxlan, all_zeros_mac); 1955 if (f == NULL) { 1956 if ((vxlan->flags & VXLAN_F_L2MISS) && 1957 !is_multicast_ether_addr(eth->h_dest)) 1958 vxlan_fdb_miss(vxlan, eth->h_dest); 1959 1960 dev->stats.tx_dropped++; 1961 kfree_skb(skb); 1962 return NETDEV_TX_OK; 1963 } 1964 } 1965 1966 list_for_each_entry_rcu(rdst, &f->remotes, list) { 1967 struct sk_buff *skb1; 1968 1969 if (!fdst) { 1970 fdst = rdst; 1971 continue; 1972 } 1973 skb1 = skb_clone(skb, GFP_ATOMIC); 1974 if (skb1) 1975 vxlan_xmit_one(skb1, dev, rdst, did_rsc); 1976 } 1977 1978 if (fdst) 1979 vxlan_xmit_one(skb, dev, fdst, did_rsc); 1980 else 1981 kfree_skb(skb); 1982 return NETDEV_TX_OK; 1983} 1984 1985/* Walk the forwarding table and purge stale entries */ 1986static void vxlan_cleanup(unsigned long arg) 1987{ 1988 struct vxlan_dev *vxlan = (struct vxlan_dev *) arg; 1989 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; 1990 unsigned int h; 1991 1992 if (!netif_running(vxlan->dev)) 1993 return; 1994 1995 spin_lock_bh(&vxlan->hash_lock); 1996 for (h = 0; h < FDB_HASH_SIZE; ++h) { 1997 struct hlist_node *p, *n; 1998 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 1999 struct vxlan_fdb *f 2000 = container_of(p, struct vxlan_fdb, hlist); 2001 unsigned long timeout; 2002 2003 if (f->state & NUD_PERMANENT) 2004 continue; 2005 2006 timeout = f->used + vxlan->age_interval * HZ; 2007 if (time_before_eq(timeout, jiffies)) { 2008 netdev_dbg(vxlan->dev, 2009 "garbage collect %pM\n", 2010 f->eth_addr); 2011 f->state = NUD_STALE; 2012 vxlan_fdb_destroy(vxlan, f); 2013 } else if (time_before(timeout, next_timer)) 2014 next_timer = timeout; 2015 } 2016 } 2017 spin_unlock_bh(&vxlan->hash_lock); 2018 2019 mod_timer(&vxlan->age_timer, next_timer); 2020} 2021 2022static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan) 2023{ 2024 __u32 vni = vxlan->default_dst.remote_vni; 2025 2026 vxlan->vn_sock = vs; 2027 hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni)); 2028} 2029 2030/* Setup stats when device is created */ 2031static int vxlan_init(struct net_device *dev) 2032{ 2033 struct vxlan_dev *vxlan = netdev_priv(dev); 2034 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2035 struct vxlan_sock *vs; 2036 2037 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 2038 if (!dev->tstats) 2039 return -ENOMEM; 2040 2041 spin_lock(&vn->sock_lock); 2042 vs = vxlan_find_sock(vxlan->net, vxlan->dst_port); 2043 if (vs) { 2044 /* If we have a socket with same port already, reuse it */ 2045 atomic_inc(&vs->refcnt); 2046 vxlan_vs_add_dev(vs, vxlan); 2047 } else { 2048 /* otherwise make new socket outside of RTNL */ 2049 dev_hold(dev); 2050 queue_work(vxlan_wq, &vxlan->sock_work); 2051 } 2052 spin_unlock(&vn->sock_lock); 2053 2054 return 0; 2055} 2056 2057static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan) 2058{ 2059 struct vxlan_fdb *f; 2060 2061 spin_lock_bh(&vxlan->hash_lock); 2062 f = __vxlan_find_mac(vxlan, all_zeros_mac); 2063 if (f) 2064 vxlan_fdb_destroy(vxlan, f); 2065 spin_unlock_bh(&vxlan->hash_lock); 2066} 2067 2068static void vxlan_uninit(struct net_device *dev) 2069{ 2070 struct vxlan_dev *vxlan = netdev_priv(dev); 2071 struct vxlan_sock *vs = vxlan->vn_sock; 2072 2073 vxlan_fdb_delete_default(vxlan); 2074 2075 if (vs) 2076 vxlan_sock_release(vs); 2077 free_percpu(dev->tstats); 2078} 2079 2080/* Start ageing timer and join group when device is brought up */ 2081static int vxlan_open(struct net_device *dev) 2082{ 2083 struct vxlan_dev *vxlan = netdev_priv(dev); 2084 struct vxlan_sock *vs = vxlan->vn_sock; 2085 2086 /* socket hasn't been created */ 2087 if (!vs) 2088 return -ENOTCONN; 2089 2090 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { 2091 vxlan_sock_hold(vs); 2092 dev_hold(dev); 2093 queue_work(vxlan_wq, &vxlan->igmp_join); 2094 } 2095 2096 if (vxlan->age_interval) 2097 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); 2098 2099 return 0; 2100} 2101 2102/* Purge the forwarding table */ 2103static void vxlan_flush(struct vxlan_dev *vxlan) 2104{ 2105 unsigned int h; 2106 2107 spin_lock_bh(&vxlan->hash_lock); 2108 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2109 struct hlist_node *p, *n; 2110 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2111 struct vxlan_fdb *f 2112 = container_of(p, struct vxlan_fdb, hlist); 2113 /* the all_zeros_mac entry is deleted at vxlan_uninit */ 2114 if (!is_zero_ether_addr(f->eth_addr)) 2115 vxlan_fdb_destroy(vxlan, f); 2116 } 2117 } 2118 spin_unlock_bh(&vxlan->hash_lock); 2119} 2120 2121/* Cleanup timer and forwarding table on shutdown */ 2122static int vxlan_stop(struct net_device *dev) 2123{ 2124 struct vxlan_dev *vxlan = netdev_priv(dev); 2125 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2126 struct vxlan_sock *vs = vxlan->vn_sock; 2127 2128 if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && 2129 !vxlan_group_used(vn, vxlan)) { 2130 vxlan_sock_hold(vs); 2131 dev_hold(dev); 2132 queue_work(vxlan_wq, &vxlan->igmp_leave); 2133 } 2134 2135 del_timer_sync(&vxlan->age_timer); 2136 2137 vxlan_flush(vxlan); 2138 2139 return 0; 2140} 2141 2142/* Stub, nothing needs to be done. */ 2143static void vxlan_set_multicast_list(struct net_device *dev) 2144{ 2145} 2146 2147static int vxlan_change_mtu(struct net_device *dev, int new_mtu) 2148{ 2149 struct vxlan_dev *vxlan = netdev_priv(dev); 2150 struct vxlan_rdst *dst = &vxlan->default_dst; 2151 struct net_device *lowerdev; 2152 int max_mtu; 2153 2154 lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex); 2155 if (lowerdev == NULL) 2156 return eth_change_mtu(dev, new_mtu); 2157 2158 if (dst->remote_ip.sa.sa_family == AF_INET6) 2159 max_mtu = lowerdev->mtu - VXLAN6_HEADROOM; 2160 else 2161 max_mtu = lowerdev->mtu - VXLAN_HEADROOM; 2162 2163 if (new_mtu < 68 || new_mtu > max_mtu) 2164 return -EINVAL; 2165 2166 dev->mtu = new_mtu; 2167 return 0; 2168} 2169 2170static const struct net_device_ops vxlan_netdev_ops = { 2171 .ndo_init = vxlan_init, 2172 .ndo_uninit = vxlan_uninit, 2173 .ndo_open = vxlan_open, 2174 .ndo_stop = vxlan_stop, 2175 .ndo_start_xmit = vxlan_xmit, 2176 .ndo_get_stats64 = ip_tunnel_get_stats64, 2177 .ndo_set_rx_mode = vxlan_set_multicast_list, 2178 .ndo_change_mtu = vxlan_change_mtu, 2179 .ndo_validate_addr = eth_validate_addr, 2180 .ndo_set_mac_address = eth_mac_addr, 2181 .ndo_fdb_add = vxlan_fdb_add, 2182 .ndo_fdb_del = vxlan_fdb_delete, 2183 .ndo_fdb_dump = vxlan_fdb_dump, 2184}; 2185 2186/* Info for udev, that this is a virtual tunnel endpoint */ 2187static struct device_type vxlan_type = { 2188 .name = "vxlan", 2189}; 2190 2191/* Calls the ndo_add_vxlan_port of the caller in order to 2192 * supply the listening VXLAN udp ports. Callers are expected 2193 * to implement the ndo_add_vxlan_port. 2194 */ 2195void vxlan_get_rx_port(struct net_device *dev) 2196{ 2197 struct vxlan_sock *vs; 2198 struct net *net = dev_net(dev); 2199 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2200 sa_family_t sa_family; 2201 __be16 port; 2202 unsigned int i; 2203 2204 spin_lock(&vn->sock_lock); 2205 for (i = 0; i < PORT_HASH_SIZE; ++i) { 2206 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { 2207 port = inet_sk(vs->sock->sk)->inet_sport; 2208 sa_family = vs->sock->sk->sk_family; 2209 dev->netdev_ops->ndo_add_vxlan_port(dev, sa_family, 2210 port); 2211 } 2212 } 2213 spin_unlock(&vn->sock_lock); 2214} 2215EXPORT_SYMBOL_GPL(vxlan_get_rx_port); 2216 2217/* Initialize the device structure. */ 2218static void vxlan_setup(struct net_device *dev) 2219{ 2220 struct vxlan_dev *vxlan = netdev_priv(dev); 2221 unsigned int h; 2222 2223 eth_hw_addr_random(dev); 2224 ether_setup(dev); 2225 if (vxlan->default_dst.remote_ip.sa.sa_family == AF_INET6) 2226 dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM; 2227 else 2228 dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM; 2229 2230 dev->netdev_ops = &vxlan_netdev_ops; 2231 dev->destructor = free_netdev; 2232 SET_NETDEV_DEVTYPE(dev, &vxlan_type); 2233 2234 dev->tx_queue_len = 0; 2235 dev->features |= NETIF_F_LLTX; 2236 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 2237 dev->features |= NETIF_F_RXCSUM; 2238 dev->features |= NETIF_F_GSO_SOFTWARE; 2239 2240 dev->vlan_features = dev->features; 2241 dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; 2242 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 2243 dev->hw_features |= NETIF_F_GSO_SOFTWARE; 2244 dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; 2245 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; 2246 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 2247 2248 INIT_LIST_HEAD(&vxlan->next); 2249 spin_lock_init(&vxlan->hash_lock); 2250 INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join); 2251 INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave); 2252 INIT_WORK(&vxlan->sock_work, vxlan_sock_work); 2253 2254 init_timer_deferrable(&vxlan->age_timer); 2255 vxlan->age_timer.function = vxlan_cleanup; 2256 vxlan->age_timer.data = (unsigned long) vxlan; 2257 2258 vxlan->dst_port = htons(vxlan_port); 2259 2260 vxlan->dev = dev; 2261 2262 for (h = 0; h < FDB_HASH_SIZE; ++h) 2263 INIT_HLIST_HEAD(&vxlan->fdb_head[h]); 2264} 2265 2266static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { 2267 [IFLA_VXLAN_ID] = { .type = NLA_U32 }, 2268 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 2269 [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, 2270 [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, 2271 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 2272 [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, 2273 [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, 2274 [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, 2275 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, 2276 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 2277 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 2278 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 2279 [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 2280 [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 2281 [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 2282 [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 2283 [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, 2284}; 2285 2286static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[]) 2287{ 2288 if (tb[IFLA_ADDRESS]) { 2289 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { 2290 pr_debug("invalid link address (not ethernet)\n"); 2291 return -EINVAL; 2292 } 2293 2294 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 2295 pr_debug("invalid all zero ethernet address\n"); 2296 return -EADDRNOTAVAIL; 2297 } 2298 } 2299 2300 if (!data) 2301 return -EINVAL; 2302 2303 if (data[IFLA_VXLAN_ID]) { 2304 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); 2305 if (id >= VXLAN_VID_MASK) 2306 return -ERANGE; 2307 } 2308 2309 if (data[IFLA_VXLAN_PORT_RANGE]) { 2310 const struct ifla_vxlan_port_range *p 2311 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 2312 2313 if (ntohs(p->high) < ntohs(p->low)) { 2314 pr_debug("port range %u .. %u not valid\n", 2315 ntohs(p->low), ntohs(p->high)); 2316 return -EINVAL; 2317 } 2318 } 2319 2320 return 0; 2321} 2322 2323static void vxlan_get_drvinfo(struct net_device *netdev, 2324 struct ethtool_drvinfo *drvinfo) 2325{ 2326 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); 2327 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); 2328} 2329 2330static const struct ethtool_ops vxlan_ethtool_ops = { 2331 .get_drvinfo = vxlan_get_drvinfo, 2332 .get_link = ethtool_op_get_link, 2333}; 2334 2335static void vxlan_del_work(struct work_struct *work) 2336{ 2337 struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work); 2338 2339 sk_release_kernel(vs->sock->sk); 2340 kfree_rcu(vs, rcu); 2341} 2342 2343static struct socket *vxlan_create_sock(struct net *net, bool ipv6, 2344 __be16 port, u32 flags) 2345{ 2346 struct socket *sock; 2347 struct udp_port_cfg udp_conf; 2348 int err; 2349 2350 memset(&udp_conf, 0, sizeof(udp_conf)); 2351 2352 if (ipv6) { 2353 udp_conf.family = AF_INET6; 2354 udp_conf.use_udp6_tx_checksums = 2355 !!(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); 2356 udp_conf.use_udp6_rx_checksums = 2357 !!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); 2358 } else { 2359 udp_conf.family = AF_INET; 2360 udp_conf.local_ip.s_addr = INADDR_ANY; 2361 udp_conf.use_udp_checksums = 2362 !!(flags & VXLAN_F_UDP_CSUM); 2363 } 2364 2365 udp_conf.local_udp_port = port; 2366 2367 /* Open UDP socket */ 2368 err = udp_sock_create(net, &udp_conf, &sock); 2369 if (err < 0) 2370 return ERR_PTR(err); 2371 2372 /* Disable multicast loopback */ 2373 inet_sk(sock->sk)->mc_loop = 0; 2374 2375 return sock; 2376} 2377 2378/* Create new listen socket if needed */ 2379static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port, 2380 vxlan_rcv_t *rcv, void *data, 2381 u32 flags) 2382{ 2383 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2384 struct vxlan_sock *vs; 2385 struct socket *sock; 2386 struct sock *sk; 2387 unsigned int h; 2388 bool ipv6 = !!(flags & VXLAN_F_IPV6); 2389 2390 vs = kzalloc(sizeof(*vs), GFP_KERNEL); 2391 if (!vs) 2392 return ERR_PTR(-ENOMEM); 2393 2394 for (h = 0; h < VNI_HASH_SIZE; ++h) 2395 INIT_HLIST_HEAD(&vs->vni_list[h]); 2396 2397 INIT_WORK(&vs->del_work, vxlan_del_work); 2398 2399 sock = vxlan_create_sock(net, ipv6, port, flags); 2400 if (IS_ERR(sock)) { 2401 kfree(vs); 2402 return ERR_CAST(sock); 2403 } 2404 2405 vs->sock = sock; 2406 sk = sock->sk; 2407 atomic_set(&vs->refcnt, 1); 2408 vs->rcv = rcv; 2409 vs->data = data; 2410 rcu_assign_sk_user_data(vs->sock->sk, vs); 2411 2412 /* Initialize the vxlan udp offloads structure */ 2413 vs->udp_offloads.port = port; 2414 vs->udp_offloads.callbacks.gro_receive = vxlan_gro_receive; 2415 vs->udp_offloads.callbacks.gro_complete = vxlan_gro_complete; 2416 2417 spin_lock(&vn->sock_lock); 2418 hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); 2419 vxlan_notify_add_rx_port(vs); 2420 spin_unlock(&vn->sock_lock); 2421 2422 /* Mark socket as an encapsulation socket. */ 2423 udp_sk(sk)->encap_type = 1; 2424 udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv; 2425#if IS_ENABLED(CONFIG_IPV6) 2426 if (ipv6) 2427 ipv6_stub->udpv6_encap_enable(); 2428 else 2429#endif 2430 udp_encap_enable(); 2431 2432 return vs; 2433} 2434 2435struct vxlan_sock *vxlan_sock_add(struct net *net, __be16 port, 2436 vxlan_rcv_t *rcv, void *data, 2437 bool no_share, u32 flags) 2438{ 2439 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2440 struct vxlan_sock *vs; 2441 2442 vs = vxlan_socket_create(net, port, rcv, data, flags); 2443 if (!IS_ERR(vs)) 2444 return vs; 2445 2446 if (no_share) /* Return error if sharing is not allowed. */ 2447 return vs; 2448 2449 spin_lock(&vn->sock_lock); 2450 vs = vxlan_find_sock(net, port); 2451 if (vs) { 2452 if (vs->rcv == rcv) 2453 atomic_inc(&vs->refcnt); 2454 else 2455 vs = ERR_PTR(-EBUSY); 2456 } 2457 spin_unlock(&vn->sock_lock); 2458 2459 if (!vs) 2460 vs = ERR_PTR(-EINVAL); 2461 2462 return vs; 2463} 2464EXPORT_SYMBOL_GPL(vxlan_sock_add); 2465 2466/* Scheduled at device creation to bind to a socket */ 2467static void vxlan_sock_work(struct work_struct *work) 2468{ 2469 struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work); 2470 struct net *net = vxlan->net; 2471 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2472 __be16 port = vxlan->dst_port; 2473 struct vxlan_sock *nvs; 2474 2475 nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags); 2476 spin_lock(&vn->sock_lock); 2477 if (!IS_ERR(nvs)) 2478 vxlan_vs_add_dev(nvs, vxlan); 2479 spin_unlock(&vn->sock_lock); 2480 2481 dev_put(vxlan->dev); 2482} 2483 2484static int vxlan_newlink(struct net *net, struct net_device *dev, 2485 struct nlattr *tb[], struct nlattr *data[]) 2486{ 2487 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2488 struct vxlan_dev *vxlan = netdev_priv(dev); 2489 struct vxlan_rdst *dst = &vxlan->default_dst; 2490 __u32 vni; 2491 int err; 2492 bool use_ipv6 = false; 2493 2494 if (!data[IFLA_VXLAN_ID]) 2495 return -EINVAL; 2496 2497 vxlan->net = dev_net(dev); 2498 2499 vni = nla_get_u32(data[IFLA_VXLAN_ID]); 2500 dst->remote_vni = vni; 2501 2502 /* Unless IPv6 is explicitly requested, assume IPv4 */ 2503 dst->remote_ip.sa.sa_family = AF_INET; 2504 if (data[IFLA_VXLAN_GROUP]) { 2505 dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]); 2506 } else if (data[IFLA_VXLAN_GROUP6]) { 2507 if (!IS_ENABLED(CONFIG_IPV6)) 2508 return -EPFNOSUPPORT; 2509 2510 nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6], 2511 sizeof(struct in6_addr)); 2512 dst->remote_ip.sa.sa_family = AF_INET6; 2513 use_ipv6 = true; 2514 } 2515 2516 if (data[IFLA_VXLAN_LOCAL]) { 2517 vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]); 2518 vxlan->saddr.sa.sa_family = AF_INET; 2519 } else if (data[IFLA_VXLAN_LOCAL6]) { 2520 if (!IS_ENABLED(CONFIG_IPV6)) 2521 return -EPFNOSUPPORT; 2522 2523 /* TODO: respect scope id */ 2524 nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6], 2525 sizeof(struct in6_addr)); 2526 vxlan->saddr.sa.sa_family = AF_INET6; 2527 use_ipv6 = true; 2528 } 2529 2530 if (data[IFLA_VXLAN_LINK] && 2531 (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) { 2532 struct net_device *lowerdev 2533 = __dev_get_by_index(net, dst->remote_ifindex); 2534 2535 if (!lowerdev) { 2536 pr_info("ifindex %d does not exist\n", dst->remote_ifindex); 2537 return -ENODEV; 2538 } 2539 2540#if IS_ENABLED(CONFIG_IPV6) 2541 if (use_ipv6) { 2542 struct inet6_dev *idev = __in6_dev_get(lowerdev); 2543 if (idev && idev->cnf.disable_ipv6) { 2544 pr_info("IPv6 is disabled via sysctl\n"); 2545 return -EPERM; 2546 } 2547 vxlan->flags |= VXLAN_F_IPV6; 2548 } 2549#endif 2550 2551 if (!tb[IFLA_MTU]) 2552 dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2553 2554 dev->needed_headroom = lowerdev->hard_header_len + 2555 (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2556 } else if (use_ipv6) 2557 vxlan->flags |= VXLAN_F_IPV6; 2558 2559 if (data[IFLA_VXLAN_TOS]) 2560 vxlan->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); 2561 2562 if (data[IFLA_VXLAN_TTL]) 2563 vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 2564 2565 if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING])) 2566 vxlan->flags |= VXLAN_F_LEARN; 2567 2568 if (data[IFLA_VXLAN_AGEING]) 2569 vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 2570 else 2571 vxlan->age_interval = FDB_AGE_DEFAULT; 2572 2573 if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY])) 2574 vxlan->flags |= VXLAN_F_PROXY; 2575 2576 if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC])) 2577 vxlan->flags |= VXLAN_F_RSC; 2578 2579 if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS])) 2580 vxlan->flags |= VXLAN_F_L2MISS; 2581 2582 if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS])) 2583 vxlan->flags |= VXLAN_F_L3MISS; 2584 2585 if (data[IFLA_VXLAN_LIMIT]) 2586 vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); 2587 2588 if (data[IFLA_VXLAN_PORT_RANGE]) { 2589 const struct ifla_vxlan_port_range *p 2590 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 2591 vxlan->port_min = ntohs(p->low); 2592 vxlan->port_max = ntohs(p->high); 2593 } 2594 2595 if (data[IFLA_VXLAN_PORT]) 2596 vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); 2597 2598 if (data[IFLA_VXLAN_UDP_CSUM] && nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) 2599 vxlan->flags |= VXLAN_F_UDP_CSUM; 2600 2601 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX] && 2602 nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) 2603 vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; 2604 2605 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX] && 2606 nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) 2607 vxlan->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; 2608 2609 if (vxlan_find_vni(net, vni, vxlan->dst_port)) { 2610 pr_info("duplicate VNI %u\n", vni); 2611 return -EEXIST; 2612 } 2613 2614 dev->ethtool_ops = &vxlan_ethtool_ops; 2615 2616 /* create an fdb entry for a valid default destination */ 2617 if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { 2618 err = vxlan_fdb_create(vxlan, all_zeros_mac, 2619 &vxlan->default_dst.remote_ip, 2620 NUD_REACHABLE|NUD_PERMANENT, 2621 NLM_F_EXCL|NLM_F_CREATE, 2622 vxlan->dst_port, 2623 vxlan->default_dst.remote_vni, 2624 vxlan->default_dst.remote_ifindex, 2625 NTF_SELF); 2626 if (err) 2627 return err; 2628 } 2629 2630 err = register_netdevice(dev); 2631 if (err) { 2632 vxlan_fdb_delete_default(vxlan); 2633 return err; 2634 } 2635 2636 list_add(&vxlan->next, &vn->vxlan_list); 2637 2638 return 0; 2639} 2640 2641static void vxlan_dellink(struct net_device *dev, struct list_head *head) 2642{ 2643 struct vxlan_dev *vxlan = netdev_priv(dev); 2644 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2645 2646 spin_lock(&vn->sock_lock); 2647 if (!hlist_unhashed(&vxlan->hlist)) 2648 hlist_del_rcu(&vxlan->hlist); 2649 spin_unlock(&vn->sock_lock); 2650 2651 list_del(&vxlan->next); 2652 unregister_netdevice_queue(dev, head); 2653} 2654 2655static size_t vxlan_get_size(const struct net_device *dev) 2656{ 2657 2658 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ 2659 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ 2660 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ 2661 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ 2662 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 2663 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 2664 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 2665 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 2666 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 2667 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 2668 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 2669 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 2670 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 2671 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + 2672 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ 2673 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ 2674 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ 2675 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ 2676 0; 2677} 2678 2679static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) 2680{ 2681 const struct vxlan_dev *vxlan = netdev_priv(dev); 2682 const struct vxlan_rdst *dst = &vxlan->default_dst; 2683 struct ifla_vxlan_port_range ports = { 2684 .low = htons(vxlan->port_min), 2685 .high = htons(vxlan->port_max), 2686 }; 2687 2688 if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni)) 2689 goto nla_put_failure; 2690 2691 if (!vxlan_addr_any(&dst->remote_ip)) { 2692 if (dst->remote_ip.sa.sa_family == AF_INET) { 2693 if (nla_put_be32(skb, IFLA_VXLAN_GROUP, 2694 dst->remote_ip.sin.sin_addr.s_addr)) 2695 goto nla_put_failure; 2696#if IS_ENABLED(CONFIG_IPV6) 2697 } else { 2698 if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr), 2699 &dst->remote_ip.sin6.sin6_addr)) 2700 goto nla_put_failure; 2701#endif 2702 } 2703 } 2704 2705 if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) 2706 goto nla_put_failure; 2707 2708 if (!vxlan_addr_any(&vxlan->saddr)) { 2709 if (vxlan->saddr.sa.sa_family == AF_INET) { 2710 if (nla_put_be32(skb, IFLA_VXLAN_LOCAL, 2711 vxlan->saddr.sin.sin_addr.s_addr)) 2712 goto nla_put_failure; 2713#if IS_ENABLED(CONFIG_IPV6) 2714 } else { 2715 if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr), 2716 &vxlan->saddr.sin6.sin6_addr)) 2717 goto nla_put_failure; 2718#endif 2719 } 2720 } 2721 2722 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) || 2723 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) || 2724 nla_put_u8(skb, IFLA_VXLAN_LEARNING, 2725 !!(vxlan->flags & VXLAN_F_LEARN)) || 2726 nla_put_u8(skb, IFLA_VXLAN_PROXY, 2727 !!(vxlan->flags & VXLAN_F_PROXY)) || 2728 nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) || 2729 nla_put_u8(skb, IFLA_VXLAN_L2MISS, 2730 !!(vxlan->flags & VXLAN_F_L2MISS)) || 2731 nla_put_u8(skb, IFLA_VXLAN_L3MISS, 2732 !!(vxlan->flags & VXLAN_F_L3MISS)) || 2733 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) || 2734 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) || 2735 nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port) || 2736 nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, 2737 !!(vxlan->flags & VXLAN_F_UDP_CSUM)) || 2738 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 2739 !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || 2740 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 2741 !!(vxlan->flags & VXLAN_F_UDP_ZERO_CSUM6_RX))) 2742 goto nla_put_failure; 2743 2744 if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) 2745 goto nla_put_failure; 2746 2747 return 0; 2748 2749nla_put_failure: 2750 return -EMSGSIZE; 2751} 2752 2753static struct rtnl_link_ops vxlan_link_ops __read_mostly = { 2754 .kind = "vxlan", 2755 .maxtype = IFLA_VXLAN_MAX, 2756 .policy = vxlan_policy, 2757 .priv_size = sizeof(struct vxlan_dev), 2758 .setup = vxlan_setup, 2759 .validate = vxlan_validate, 2760 .newlink = vxlan_newlink, 2761 .dellink = vxlan_dellink, 2762 .get_size = vxlan_get_size, 2763 .fill_info = vxlan_fill_info, 2764}; 2765 2766static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, 2767 struct net_device *dev) 2768{ 2769 struct vxlan_dev *vxlan, *next; 2770 LIST_HEAD(list_kill); 2771 2772 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 2773 struct vxlan_rdst *dst = &vxlan->default_dst; 2774 2775 /* In case we created vxlan device with carrier 2776 * and we loose the carrier due to module unload 2777 * we also need to remove vxlan device. In other 2778 * cases, it's not necessary and remote_ifindex 2779 * is 0 here, so no matches. 2780 */ 2781 if (dst->remote_ifindex == dev->ifindex) 2782 vxlan_dellink(vxlan->dev, &list_kill); 2783 } 2784 2785 unregister_netdevice_many(&list_kill); 2786} 2787 2788static int vxlan_lowerdev_event(struct notifier_block *unused, 2789 unsigned long event, void *ptr) 2790{ 2791 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 2792 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 2793 2794 if (event == NETDEV_UNREGISTER) 2795 vxlan_handle_lowerdev_unregister(vn, dev); 2796 2797 return NOTIFY_DONE; 2798} 2799 2800static struct notifier_block vxlan_notifier_block __read_mostly = { 2801 .notifier_call = vxlan_lowerdev_event, 2802}; 2803 2804static __net_init int vxlan_init_net(struct net *net) 2805{ 2806 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2807 unsigned int h; 2808 2809 INIT_LIST_HEAD(&vn->vxlan_list); 2810 spin_lock_init(&vn->sock_lock); 2811 2812 for (h = 0; h < PORT_HASH_SIZE; ++h) 2813 INIT_HLIST_HEAD(&vn->sock_list[h]); 2814 2815 return 0; 2816} 2817 2818static void __net_exit vxlan_exit_net(struct net *net) 2819{ 2820 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2821 struct vxlan_dev *vxlan, *next; 2822 struct net_device *dev, *aux; 2823 LIST_HEAD(list); 2824 2825 rtnl_lock(); 2826 for_each_netdev_safe(net, dev, aux) 2827 if (dev->rtnl_link_ops == &vxlan_link_ops) 2828 unregister_netdevice_queue(dev, &list); 2829 2830 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 2831 /* If vxlan->dev is in the same netns, it has already been added 2832 * to the list by the previous loop. 2833 */ 2834 if (!net_eq(dev_net(vxlan->dev), net)) 2835 unregister_netdevice_queue(dev, &list); 2836 } 2837 2838 unregister_netdevice_many(&list); 2839 rtnl_unlock(); 2840} 2841 2842static struct pernet_operations vxlan_net_ops = { 2843 .init = vxlan_init_net, 2844 .exit = vxlan_exit_net, 2845 .id = &vxlan_net_id, 2846 .size = sizeof(struct vxlan_net), 2847}; 2848 2849static int __init vxlan_init_module(void) 2850{ 2851 int rc; 2852 2853 vxlan_wq = alloc_workqueue("vxlan", 0, 0); 2854 if (!vxlan_wq) 2855 return -ENOMEM; 2856 2857 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); 2858 2859 rc = register_pernet_subsys(&vxlan_net_ops); 2860 if (rc) 2861 goto out1; 2862 2863 rc = register_netdevice_notifier(&vxlan_notifier_block); 2864 if (rc) 2865 goto out2; 2866 2867 rc = rtnl_link_register(&vxlan_link_ops); 2868 if (rc) 2869 goto out3; 2870 2871 return 0; 2872out3: 2873 unregister_netdevice_notifier(&vxlan_notifier_block); 2874out2: 2875 unregister_pernet_subsys(&vxlan_net_ops); 2876out1: 2877 destroy_workqueue(vxlan_wq); 2878 return rc; 2879} 2880late_initcall(vxlan_init_module); 2881 2882static void __exit vxlan_cleanup_module(void) 2883{ 2884 rtnl_link_unregister(&vxlan_link_ops); 2885 unregister_netdevice_notifier(&vxlan_notifier_block); 2886 destroy_workqueue(vxlan_wq); 2887 unregister_pernet_subsys(&vxlan_net_ops); 2888 /* rcu_barrier() is called by netns */ 2889} 2890module_exit(vxlan_cleanup_module); 2891 2892MODULE_LICENSE("GPL"); 2893MODULE_VERSION(VXLAN_VERSION); 2894MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); 2895MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); 2896MODULE_ALIAS_RTNL_LINK("vxlan");