Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
drivers/net/vxlan.c at v4.19-rc2 (3857 lines, 99 kB)
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
#include <linux/ethtool.h>
#include <net/arp.h>
#include <net/ndisc.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/tun_proto.h>
#include <net/vxlan.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
#include <net/ip6_checksum.h>
#endif

#define VXLAN_VERSION	"0.1"

#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE	(1<<PORT_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

static unsigned int vxlan_net_id;
static struct rtnl_link_ops vxlan_link_ops;

static const u8 all_zeros_mac[ETH_ALEN + 2];

static int vxlan_sock_add(struct vxlan_dev *vxlan);

static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
	spinlock_t	  sock_lock;
};

/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
	struct list_head  remotes;
	u8		  eth_addr[ETH_ALEN];
	u16		  state;	/* see ndm_state */
	__be32		  vni;
	u8		  flags;	/* see ndm_flags */
};

/* salt for hash table */
static u32 vxlan_salt __read_mostly;

static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
}

#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_any(&ipa->sin6.sin6_addr);
	else
		return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	if (ipa->sa.sa_family == AF_INET6)
		return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
	else
		return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}

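/* Illustrative usage sketch (not from the upstream file): the helpers
 * above dispatch purely on sa_family inside union vxlan_addr, e.g.:
 *
 *	union vxlan_addr a = { .sin = {
 *		.sin_family = AF_INET,
 *		.sin_addr.s_addr = htonl(INADDR_ANY),
 *	} };
 *	vxlan_addr_any(&a);		-- true: IPv4 wildcard
 *	vxlan_addr_multicast(&a);	-- false
 */
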
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
			      const union vxlan_addr *ip)
{
	if (ip->sa.sa_family == AF_INET6)
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
	else
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}

static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
	return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}

static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
	return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
			      const union vxlan_addr *ip)
{
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#endif

/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}

/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);

	return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}

/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}

/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
					  __be16 port, u32 flags)
{
	struct vxlan_sock *vs;

	flags &= VXLAN_F_RCV_FLAGS;

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
		    vxlan_get_sk_family(vs) == family &&
		    vs->flags == flags)
			return vs;
	}
	return NULL;
}

static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (node->vxlan->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6)) {
			const struct vxlan_config *cfg = &node->vxlan->cfg;

			if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) &&
			    cfg->remote_ifindex != ifindex)
				continue;
		}

		return node->vxlan;
	}

	return NULL;
}

/* Look up VNI in a per net namespace table */
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
{
	struct vxlan_sock *vs;

	vs = vxlan_find_sock(net, family, port, flags);
	if (!vs)
		return NULL;

	return vxlan_vs_find_vni(vs, ifindex, vni);
}

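/* Lookup sketch (illustrative): an incoming datagram is resolved in two
 * steps -- first the vxlan_sock bound to its namespace/family/port, then
 * the vxlan_dev registered for the VNI in the header:
 *
 *	vs  = vxlan_find_sock(net, AF_INET, htons(4789), flags);
 *	dev = vs ? vxlan_vs_find_vni(vs, ifindex, vni) : NULL;
 *
 * which is exactly what vxlan_find_vni() above wraps.
 */
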
/* Fill in neighbour message in skbuff. */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;
	bool send_ip, send_eth;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	send_eth = send_ip = true;

	if (type == RTM_GETNEIGH) {
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
		ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
	} else
		ndm->ndm_family = AF_BRIDGE;
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = RTN_UNICAST;

	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
		goto nla_put_failure;
	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
	    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
		goto nla_put_failure;
	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI,
			be32_to_cpu(fdb->vni)))
		goto nla_put_failure;
	if (rdst->remote_ifindex &&
	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
		.remote_ip = *ipa, /* goes to NDA_DST */
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
	};

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = { };

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH);
}

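/* Sizing note (illustrative): vxlan_nlmsg_size() is a worst case -- it
 * always reserves room for an IPv6-sized NDA_DST plus every optional
 * attribute, so an skb allocated with nlmsg_new() above can never
 * legitimately overflow; hence the WARN_ON(-EMSGSIZE) in
 * vxlan_fdb_notify().
 */
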
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}

static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	u32 key = get_unaligned((u32 *)(addr + 2));

	return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}

/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
						const u8 *mac, __be32 vni)
{
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
		return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
	else
		return &vxlan->fdb_head[eth_hash(mac)];
}

/* Look up Ethernet address in forwarding table */
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
					  const u8 *mac, __be32 vni)
{
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
	struct vxlan_fdb *f;

	hlist_for_each_entry_rcu(f, head, hlist) {
		if (ether_addr_equal(mac, f->eth_addr)) {
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
	}

	return NULL;
}

static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
					const u8 *mac, __be32 vni)
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, vni);
	if (f)
		f->used = jiffies;

	return f;
}

/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
					      union vxlan_addr *ip, __be16 port,
					      __be32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd;

	list_for_each_entry(rd, &f->remotes, list) {
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
			return rd;
	}

	return NULL;
}

/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;

	dst_cache_reset(&rd->dst_cache);
	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	return 1;
}

/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    union vxlan_addr *ip, __be16 port, __be32 vni,
			    __u32 ifindex, struct vxlan_rdst **rdp)
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

	rd->remote_ip = *ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;

	list_add_tail_rcu(&rd->list, &f->remotes);

	*rdp = rd;
	return 1;
}

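/* Hashing sketch (illustrative): eth_hash() loads 8 bytes starting at
 * the MAC and shifts out the 2 that are not part of the address; on
 * little-endian, for MAC 00:11:22:33:44:55 followed by bytes XX YY:
 *
 *	value = 0xYYXX554433221100ULL;
 *	value <<= 16;		-- 0x5544332211000000, stray bytes gone
 *	hash_64(value, FDB_HASH_BITS);
 *
 * eth_vni_hash() instead mixes 4 MAC bytes with the VNI and a boot-time
 * random salt, so collect-metadata devices spread entries that share a
 * MAC across VNIs.
 */
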
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
					  __be32 vni_field,
					  struct gro_remcsum *grc,
					  bool nopartial)
{
	size_t start, offset;

	if (skb->remcsum_offload)
		return vh;

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);

	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);

	skb->remcsum_offload = 1;

	return vh;
}

static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
{
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
	struct vxlanhdr *vh, *vh2;
	unsigned int hlen, off_vx;
	int flush = 1;
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
	__be32 flags;
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

	flags = vh->vx_flags;

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
				       vh->vx_vni, &grc,
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));

		if (!vh)
			goto out;
	}

	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

	list_for_each_entry(p, head, list) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	pp = call_gro_receive(eth_gro_receive, head, skb);
	flush = 0;

out:
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);

	return pp;
}

static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
}

static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan,
					 const u8 *mac, __u16 state,
					 __be32 src_vni, __u8 ndm_flags)
{
	struct vxlan_fdb *f;

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return NULL;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;
	f->vni = src_vni;
	INIT_LIST_HEAD(&f->remotes);
	memcpy(f->eth_addr, mac, ETH_ALEN);

	return f;
}

static int vxlan_fdb_create(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
			    __be32 vni, __u32 ifindex, __u8 ndm_flags,
			    struct vxlan_fdb **fdb)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
	if (!f)
		return -ENOMEM;

	rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0) {
		kfree(f);
		return rc;
	}

	++vxlan->addrcnt;
	hlist_add_head_rcu(&f->hlist,
			   vxlan_fdb_head(vxlan, mac, src_vni));

	*fdb = f;

	return 0;
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __be32 src_vni, __be32 vni,
			    __u32 ifindex, __u8 ndm_flags)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int notify = 0;
	int rc;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != ndm_flags) {
			f->flags = ndm_flags;
			f->updated = jiffies;
			notify = 1;
		}
		if ((flags & NLM_F_REPLACE)) {
			/* Only change unicasts */
			if (!(is_multicast_ether_addr(f->eth_addr) ||
			      is_zero_ether_addr(f->eth_addr))) {
				notify |= vxlan_fdb_replace(f, ip, port, vni,
							    ifindex);
			} else
				return -EOPNOTSUPP;
		}
		if ((flags & NLM_F_APPEND) &&
		    (is_multicast_ether_addr(f->eth_addr) ||
		     is_zero_ether_addr(f->eth_addr))) {
			rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);

			if (rc < 0)
				return rc;
			notify |= rc;
		}
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		/* Disallow replace to add a multicast entry */
		if ((flags & NLM_F_REPLACE) &&
		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
			return -EOPNOTSUPP;

		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
		rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
				      vni, ifindex, ndm_flags, &f);
		if (rc < 0)
			return rc;
		notify = 1;
	}

	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
		vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH);
	}

	return 0;
}

static void vxlan_fdb_free(struct rcu_head *head)
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
	struct vxlan_rdst *rd, *nd;

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

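/* Semantics sketch (illustrative): vxlan_fdb_update() maps the usual
 * netlink flag combinations onto FDB operations, roughly:
 *
 *	NLM_F_CREATE | NLM_F_EXCL	-- add, -EEXIST if already present
 *	NLM_F_REPLACE			-- rewrite the remote of a unicast
 *	NLM_F_APPEND			-- add one more remote to a
 *					   multicast / all-zeros entry
 *
 * Multicast and all-zeros ("default") entries keep a list of remotes,
 * so replace is refused (-EOPNOTSUPP) for them, while append is simply
 * ignored for unicasts.
 */
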
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify)
{
	netdev_dbg(vxlan->dev,
		   "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	if (do_notify)
		vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu);

	dst_cache_destroy(&rd->dst_cache);
	kfree(rd);
}

static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd)
{
	list_del_rcu(&rd->list);
	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH);
	call_rcu(&rd->rcu, vxlan_dst_free);
}

static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->cfg.dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_SRC_VNI]) {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	} else {
		*src_vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		tdev = __dev_get_by_index(net, *ifindex);
		if (!tdev)
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	return 0;
}

/* Add static entry (via netlink) */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* struct net *net = dev_net(vxlan->dev); */
	union vxlan_addr ip;
	__be16 port;
	__be32 src_vni, vni;
	u32 ifindex;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (tb[NDA_DST] == NULL)
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
	if (err)
		return err;

	if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
		return -EAFNOSUPPORT;

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex, ndm->ndm_flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}

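/* Userspace sketch (illustrative): vxlan_fdb_add() is what services an
 * iproute2 command such as
 *
 *	bridge fdb add 00:11:22:33:44:55 dev vxlan0 dst 192.0.2.2
 *
 * where NDA_DST carries the remote tunnel endpoint and NDA_PORT /
 * NDA_VNI fall back to the device's configured dst_port and VNI when
 * omitted (see vxlan_fdb_parse() above).
 */
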
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
			      __be16 port, __be32 src_vni, __be32 vni,
			      u32 ifindex, u16 vid)
{
	struct vxlan_fdb *f;
	struct vxlan_rdst *rd = NULL;
	int err = -ENOENT;

	f = vxlan_find_mac(vxlan, addr, src_vni);
	if (!f)
		return err;

	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
		vxlan_fdb_dst_destroy(vxlan, f, rd);
		goto out;
	}

	vxlan_fdb_destroy(vxlan, f, true);

out:
	return 0;
}

/* Delete entry (via netlink) */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr ip;
	__be32 src_vni, vni;
	__be16 port;
	u32 ifindex;
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
	if (err)
		return err;

	spin_lock_bh(&vxlan->hash_lock);
	err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				 vid);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
			  struct net_device *dev,
			  struct net_device *filter_dev, int *idx)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
	int err = 0;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
			struct vxlan_rdst *rd;

			list_for_each_entry_rcu(rd, &f->remotes, list) {
				if (*idx < cb->args[2])
					goto skip;

				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
				if (err < 0)
					goto out;
skip:
				*idx += 1;
			}
		}
	}
out:
	return err;
}

/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
 * Return true if packet is bogus and should be dropped.
 */
static bool vxlan_snoop(struct net_device *dev,
			union vxlan_addr *src_ip, const u8 *src_mac,
			u32 src_ifindex, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif

	f = vxlan_find_mac(vxlan, src_mac, vni);
	if (likely(f)) {
		struct vxlan_rdst *rdst = first_remote_rcu(f);

		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
			return false;

		/* Don't migrate static entries, drop packets */
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
			return true;

		if (net_ratelimit())
			netdev_info(dev,
				    "%pM migrated from %pIS to %pIS\n",
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);

		rdst->remote_ip = *src_ip;
		f->updated = jiffies;
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH);
	} else {
		/* learned new entry */
		spin_lock(&vxlan->hash_lock);

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
			vxlan_fdb_update(vxlan, src_mac, src_ip,
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
					 vxlan->cfg.dst_port,
					 vni,
					 vxlan->default_dst.remote_vni,
					 ifindex, NTF_SELF);
		spin_unlock(&vxlan->hash_lock);
	}

	return false;
}

/* See if multicast group is already in use by other ID */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;

	sock4 = rtnl_dereference(dev->vn4_sock);

	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
		return false;
#if IS_ENABLED(CONFIG_IPV6)
	sock6 = rtnl_dereference(dev->vn6_sock);
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
		return false;
#endif

	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
		if (!netif_running(vxlan->dev) || vxlan == dev)
			continue;

		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
			continue;
#if IS_ENABLED(CONFIG_IPV6)
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
			continue;
#endif

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
	}

	return false;
}

static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
	struct vxlan_net *vn;

	if (!vs)
		return false;
	if (!refcount_dec_and_test(&vs->refcnt))
		return false;

	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
	spin_lock(&vn->sock_lock);
	hlist_del_rcu(&vs->hlist);
	udp_tunnel_notify_del_rx_port(vs->sock,
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
				      UDP_TUNNEL_TYPE_VXLAN);
	spin_unlock(&vn->sock_lock);

	return true;
}

static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
#endif

	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
	synchronize_net();

	vxlan_vs_del_dev(vxlan);

	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
	}
#endif
}

/* Update multicast group membership when first VNI on
 * multicast address is brought up
 */
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_join_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
	struct sock *sk;
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
	int ret = -EINVAL;

	if (ip->sa.sa_family == AF_INET) {
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

		sk = sock4->sock->sk;
		lock_sock(sk);
		ret = ip_mc_leave_group(sk, &mreq);
		release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
		lock_sock(sk);
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
		release_sock(sk);
#endif
	}

	return ret;
}

static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	size_t start, offset;

	if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload)
		goto out;

	start = vxlan_rco_start(unparsed->vx_vni);
	offset = start + vxlan_rco_offset(unparsed->vx_vni);

	if (!pskb_may_pull(skb, offset + sizeof(u16)))
		return false;

	skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset,
			    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
out:
	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;
	return true;
}

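/* RCO sketch (illustrative): with remote checksum offload the sender
 * encodes the inner checksum start/offset pair into the low bits of the
 * VNI field; vxlan_rco_start()/vxlan_rco_offset() recover them so that
 * skb_remcsum_process() can patch the inner checksum on receive. The
 * matching transmit side is in vxlan_build_skb(), which packs the pair
 * with vxlan_compute_rco().
 */
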
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
	struct metadata_dst *tun_dst;

	if (!(unparsed->vx_flags & VXLAN_HF_GBP))
		goto out;

	md->gbp = ntohs(gbp->policy_id);

	tun_dst = (struct metadata_dst *)skb_dst(skb);
	if (tun_dst) {
		tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
		tun_dst->u.tun_info.options_len = sizeof(*md);
	}
	if (gbp->dont_learn)
		md->gbp |= VXLAN_GBP_DONT_LEARN;

	if (gbp->policy_applied)
		md->gbp |= VXLAN_GBP_POLICY_APPLIED;

	/* In flow-based mode, GBP is carried in dst_metadata */
	if (!(vxflags & VXLAN_F_COLLECT_METADATA))
		skb->mark = md->gbp;
out:
	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
				__be16 *protocol,
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet."
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	union vxlan_addr saddr;
	u32 ifindex = skb->dev->ifindex;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if ((vxlan->cfg.flags & VXLAN_F_LEARN) &&
	    vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni))
		return false;

	return true;
}

static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	int err = 0;

	if (vxlan_get_sk_family(vs) == AF_INET)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (vxlan_get_sk_family(vs) == AF_INET)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &((struct iphdr *)oiph)->saddr,
					     ((struct iphdr *)oiph)->tos);
		else
			net_info_ratelimited("non-ECT from %pI6\n",
					     &((struct ipv6hdr *)oiph)->saddr);
	}
	return err <= 1;
}

/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct pcpu_sw_netstats *stats;
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr unparsed;
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
	__be16 protocol = htons(ETH_P_TEB);
	bool raw_proto = false;
	void *oiph;
	__be32 vni = 0;

	/* Need UDP and VXLAN header to be present */
	if (!pskb_may_pull(skb, VXLAN_HLEN))
		goto drop;

	unparsed = *vxlan_hdr(skb);
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
		goto drop;
	}
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		goto drop;

	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		goto drop;

	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
		goto drop;

	if (vxlan_collect_metadata(vs)) {
		struct metadata_dst *tun_dst;

		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
					 key32_to_tunnel_id(vni), sizeof(*md));

		if (!tun_dst)
			goto drop;

		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
	} else {
		memset(md, 0, sizeof(*md));
	}

	if (vs->flags & VXLAN_F_REMCSUM_RX)
		if (!vxlan_remcsum(&unparsed, skb, vs->flags))
			goto drop;
	if (vs->flags & VXLAN_F_GBP)
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */

	if (unparsed.vx_flags || unparsed.vx_vni) {
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * fields are to be ignored. The approach here maintains
		 * compatibility with previous stack code, and also is more
		 * robust and provides a little more security in adding
		 * extensions to VXLAN.
		 */
		goto drop;
	}

	if (!raw_proto) {
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
			goto drop;
	} else {
		skb_reset_mac_header(skb);
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	gro_cells_receive(&vxlan->gro_cells, skb);
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
}

static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				   n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

#if IS_ENABLED(CONFIG_IPV6)
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
				       struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
	reply->protocol = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
	{
		struct iphdr *pip;

		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
				.sin.sin_family = AF_INET,
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
				.sin6.sin6_family = AF_INET6,
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
	default:
		return false;
	}

	if (n) {
		bool diff;

		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
			       dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
	}

	return false;
}

static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp;

	if (!md->gbp)
		return;

	gbp = (struct vxlanhdr_gbp *)vxh;
	vxh->vx_flags |= VXLAN_HF_GBP;

	if (md->gbp & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;

	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
}

static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;

	gpe->np_applied = 1;
	gpe->next_protocol = tun_p_from_eth_p(protocol);
	if (!gpe->next_protocol)
		return -EPFNOSUPPORT;
	return 0;
}

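/* Extension sketch (illustrative): GBP reuses reserved header bits to
 * carry a 16-bit group policy ID plus two policy flags, while GPE
 * replaces the implicit Ethernet payload with an explicit next-protocol
 * field. Both rewrite the same reserved bits, which is why the driver
 * refuses to enable them together (enforced in vxlan_dev_configure, as
 * noted in vxlan_rcv()).
 */
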
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
			   bool udp_sum)
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
	__be16 inner_protocol = htons(ETH_P_TEB);

	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
		     skb->csum_offset == offsetof(struct tcphdr, check)))
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
			+ VXLAN_HLEN + iphdr_len;

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		return err;

	err = iptunnel_handle_offloads(skb, type);
	if (err)
		return err;

	vxh = __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);

	if (type & SKB_GSO_TUNNEL_REMCSUM) {
		unsigned int start;

		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
			return err;
		inner_protocol = skb->protocol;
	}

	skb_set_inner_protocol(skb, inner_protocol);
	return 0;
}

static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *rt = NULL;
	struct flowi4 fl4;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;
	fl4.fl4_dport = dport;
	fl4.fl4_sport = sport;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (likely(!IS_ERR(rt))) {
		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n", &daddr);
			ip_rt_put(rt);
			return ERR_PTR(-ELOOP);
		}

		*saddr = fl4.saddr;
		if (use_cache)
			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
	} else {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	return rt;
}

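/* Headroom sketch (illustrative): for an IPv4 underlay the push in
 * vxlan_build_skb() needs
 *
 *	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
 *		     + VXLAN_HLEN		-- 8B UDP + 8B VXLAN
 *		     + sizeof(struct iphdr);	-- 20B outer IPv4
 *
 * i.e. roughly the 50 bytes of classic VXLAN-over-IPv4 overhead once
 * the 14-byte outer Ethernet header is counted via LL_RESERVED_SPACE().
 */
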
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
					  struct net_device *dev,
					  struct vxlan_sock *sock6,
					  struct sk_buff *skb, int oif, u8 tos,
					  __be32 label,
					  const struct in6_addr *daddr,
					  struct in6_addr *saddr,
					  __be16 dport, __be16 sport,
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct dst_entry *ndst;
	struct flowi6 fl6;
	int err;

	if (!sock6)
		return ERR_PTR(-EIO);

	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
	fl6.saddr = *saddr;
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;

	err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
					 sock6->sock->sk,
					 &ndst, &fl6);
	if (unlikely(err < 0)) {
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}

	*saddr = fl6.saddr;
	if (use_cache)
		dst_cache_set_ip6(dst_cache, ndst, saddr);
	return ndst;
}
#endif

/* Bypass encapsulation if the destination is local */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev = skb->dev;
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family = AF_INET6;
#endif
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, 0,
			    vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
		dev->stats.rx_dropped++;
	}
}

static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
					   daddr->sa.sa_family, dst_port,
					   vxlan->cfg.flags);
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
		return 1;
	}

	return 0;
}

: vxlan->cfg.dst_port; 2155 vni = tunnel_id_to_key32(info->key.tun_id); 2156 ifindex = 0; 2157 dst_cache = &info->dst_cache; 2158 if (info->options_len && 2159 info->key.tun_flags & TUNNEL_VXLAN_OPT) 2160 md = ip_tunnel_info_opts(info); 2161 ttl = info->key.ttl; 2162 tos = info->key.tos; 2163 label = info->key.label; 2164 udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); 2165 } 2166 src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, 2167 vxlan->cfg.port_max, true); 2168 2169 rcu_read_lock(); 2170 if (dst->sa.sa_family == AF_INET) { 2171 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); 2172 struct rtable *rt; 2173 __be16 df = 0; 2174 2175 rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos, 2176 dst->sin.sin_addr.s_addr, 2177 &local_ip.sin.sin_addr.s_addr, 2178 dst_port, src_port, 2179 dst_cache, info); 2180 if (IS_ERR(rt)) { 2181 err = PTR_ERR(rt); 2182 goto tx_error; 2183 } 2184 2185 /* Bypass encapsulation if the destination is local */ 2186 if (!info) { 2187 err = encap_bypass_if_local(skb, dev, vxlan, dst, 2188 dst_port, ifindex, vni, 2189 &rt->dst, rt->rt_flags); 2190 if (err) 2191 goto out_unlock; 2192 } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { 2193 df = htons(IP_DF); 2194 } 2195 2196 ndst = &rt->dst; 2197 if (skb_dst(skb)) { 2198 int mtu = dst_mtu(ndst) - VXLAN_HEADROOM; 2199 2200 skb_dst_update_pmtu(skb, mtu); 2201 } 2202 2203 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 2204 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 2205 err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), 2206 vni, md, flags, udp_sum); 2207 if (err < 0) 2208 goto tx_error; 2209 2210 udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr, 2211 dst->sin.sin_addr.s_addr, tos, ttl, df, 2212 src_port, dst_port, xnet, !udp_sum); 2213#if IS_ENABLED(CONFIG_IPV6) 2214 } else { 2215 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); 2216 2217 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos, 2218 label, &dst->sin6.sin6_addr, 2219 &local_ip.sin6.sin6_addr, 2220 dst_port, src_port, 2221 dst_cache, info); 2222 if (IS_ERR(ndst)) { 2223 err = PTR_ERR(ndst); 2224 ndst = NULL; 2225 goto tx_error; 2226 } 2227 2228 if (!info) { 2229 u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; 2230 2231 err = encap_bypass_if_local(skb, dev, vxlan, dst, 2232 dst_port, ifindex, vni, 2233 ndst, rt6i_flags); 2234 if (err) 2235 goto out_unlock; 2236 } 2237 2238 if (skb_dst(skb)) { 2239 int mtu = dst_mtu(ndst) - VXLAN6_HEADROOM; 2240 2241 skb_dst_update_pmtu(skb, mtu); 2242 } 2243 2244 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 2245 ttl = ttl ? : ip6_dst_hoplimit(ndst); 2246 skb_scrub_packet(skb, xnet); 2247 err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), 2248 vni, md, flags, udp_sum); 2249 if (err < 0) 2250 goto tx_error; 2251 2252 udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, 2253 &local_ip.sin6.sin6_addr, 2254 &dst->sin6.sin6_addr, tos, ttl, 2255 label, src_port, dst_port, !udp_sum); 2256#endif 2257 } 2258out_unlock: 2259 rcu_read_unlock(); 2260 return; 2261 2262drop: 2263 dev->stats.tx_dropped++; 2264 dev_kfree_skb(skb); 2265 return; 2266 2267tx_error: 2268 rcu_read_unlock(); 2269 if (err == -ELOOP) 2270 dev->stats.collisions++; 2271 else if (err == -ENETUNREACH) 2272 dev->stats.tx_carrier_errors++; 2273 dst_release(ndst); 2274 dev->stats.tx_errors++; 2275 kfree_skb(skb); 2276} 2277 2278/* Transmit local packets over Vxlan 2279 * 2280 * Outer IP header inherits ECN and DF from inner header. 
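 * (the ECN mapping itself is applied by ip_tunnel_ecn_encap() in
 * vxlan_xmit_one() above)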
2281 * Outer UDP destination is the VXLAN assigned port. 2282 * source port is based on hash of flow 2283 */ 2284static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 2285{ 2286 struct vxlan_dev *vxlan = netdev_priv(dev); 2287 struct vxlan_rdst *rdst, *fdst = NULL; 2288 const struct ip_tunnel_info *info; 2289 bool did_rsc = false; 2290 struct vxlan_fdb *f; 2291 struct ethhdr *eth; 2292 __be32 vni = 0; 2293 2294 info = skb_tunnel_info(skb); 2295 2296 skb_reset_mac_header(skb); 2297 2298 if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { 2299 if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && 2300 info->mode & IP_TUNNEL_INFO_TX) { 2301 vni = tunnel_id_to_key32(info->key.tun_id); 2302 } else { 2303 if (info && info->mode & IP_TUNNEL_INFO_TX) 2304 vxlan_xmit_one(skb, dev, vni, NULL, false); 2305 else 2306 kfree_skb(skb); 2307 return NETDEV_TX_OK; 2308 } 2309 } 2310 2311 if (vxlan->cfg.flags & VXLAN_F_PROXY) { 2312 eth = eth_hdr(skb); 2313 if (ntohs(eth->h_proto) == ETH_P_ARP) 2314 return arp_reduce(dev, skb, vni); 2315#if IS_ENABLED(CONFIG_IPV6) 2316 else if (ntohs(eth->h_proto) == ETH_P_IPV6 && 2317 pskb_may_pull(skb, sizeof(struct ipv6hdr) + 2318 sizeof(struct nd_msg)) && 2319 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 2320 struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); 2321 2322 if (m->icmph.icmp6_code == 0 && 2323 m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) 2324 return neigh_reduce(dev, skb, vni); 2325 } 2326#endif 2327 } 2328 2329 eth = eth_hdr(skb); 2330 f = vxlan_find_mac(vxlan, eth->h_dest, vni); 2331 did_rsc = false; 2332 2333 if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && 2334 (ntohs(eth->h_proto) == ETH_P_IP || 2335 ntohs(eth->h_proto) == ETH_P_IPV6)) { 2336 did_rsc = route_shortcircuit(dev, skb); 2337 if (did_rsc) 2338 f = vxlan_find_mac(vxlan, eth->h_dest, vni); 2339 } 2340 2341 if (f == NULL) { 2342 f = vxlan_find_mac(vxlan, all_zeros_mac, vni); 2343 if (f == NULL) { 2344 if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && 2345 !is_multicast_ether_addr(eth->h_dest)) 2346 vxlan_fdb_miss(vxlan, eth->h_dest); 2347 2348 dev->stats.tx_dropped++; 2349 kfree_skb(skb); 2350 return NETDEV_TX_OK; 2351 } 2352 } 2353 2354 list_for_each_entry_rcu(rdst, &f->remotes, list) { 2355 struct sk_buff *skb1; 2356 2357 if (!fdst) { 2358 fdst = rdst; 2359 continue; 2360 } 2361 skb1 = skb_clone(skb, GFP_ATOMIC); 2362 if (skb1) 2363 vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); 2364 } 2365 2366 if (fdst) 2367 vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); 2368 else 2369 kfree_skb(skb); 2370 return NETDEV_TX_OK; 2371} 2372 2373/* Walk the forwarding table and purge stale entries */ 2374static void vxlan_cleanup(struct timer_list *t) 2375{ 2376 struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); 2377 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; 2378 unsigned int h; 2379 2380 if (!netif_running(vxlan->dev)) 2381 return; 2382 2383 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2384 struct hlist_node *p, *n; 2385 2386 spin_lock_bh(&vxlan->hash_lock); 2387 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2388 struct vxlan_fdb *f 2389 = container_of(p, struct vxlan_fdb, hlist); 2390 unsigned long timeout; 2391 2392 if (f->state & (NUD_PERMANENT | NUD_NOARP)) 2393 continue; 2394 2395 if (f->flags & NTF_EXT_LEARNED) 2396 continue; 2397 2398 timeout = f->used + vxlan->cfg.age_interval * HZ; 2399 if (time_before_eq(timeout, jiffies)) { 2400 netdev_dbg(vxlan->dev, 2401 "garbage collect %pM\n", 2402 f->eth_addr); 2403 f->state = NUD_STALE; 2404 
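/* An entry expires once f->used + age_interval * HZ falls behind
 * jiffies; PERMANENT/NOARP and externally learned entries were
 * skipped above. The entry is marked NUD_STALE first so that the
 * RTM_DELNEIGH notification sent from vxlan_fdb_destroy() below
 * reports the final state to userspace listeners.
 */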
vxlan_fdb_destroy(vxlan, f, true); 2405 } else if (time_before(timeout, next_timer)) 2406 next_timer = timeout; 2407 } 2408 spin_unlock_bh(&vxlan->hash_lock); 2409 } 2410 2411 mod_timer(&vxlan->age_timer, next_timer); 2412} 2413 2414static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) 2415{ 2416 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2417 2418 spin_lock(&vn->sock_lock); 2419 hlist_del_init_rcu(&vxlan->hlist4.hlist); 2420#if IS_ENABLED(CONFIG_IPV6) 2421 hlist_del_init_rcu(&vxlan->hlist6.hlist); 2422#endif 2423 spin_unlock(&vn->sock_lock); 2424} 2425 2426static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, 2427 struct vxlan_dev_node *node) 2428{ 2429 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2430 __be32 vni = vxlan->default_dst.remote_vni; 2431 2432 node->vxlan = vxlan; 2433 spin_lock(&vn->sock_lock); 2434 hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); 2435 spin_unlock(&vn->sock_lock); 2436} 2437 2438/* Setup stats when device is created */ 2439static int vxlan_init(struct net_device *dev) 2440{ 2441 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 2442 if (!dev->tstats) 2443 return -ENOMEM; 2444 2445 return 0; 2446} 2447 2448static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) 2449{ 2450 struct vxlan_fdb *f; 2451 2452 spin_lock_bh(&vxlan->hash_lock); 2453 f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); 2454 if (f) 2455 vxlan_fdb_destroy(vxlan, f, true); 2456 spin_unlock_bh(&vxlan->hash_lock); 2457} 2458 2459static void vxlan_uninit(struct net_device *dev) 2460{ 2461 struct vxlan_dev *vxlan = netdev_priv(dev); 2462 2463 vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); 2464 2465 free_percpu(dev->tstats); 2466} 2467 2468/* Start ageing timer and join group when device is brought up */ 2469static int vxlan_open(struct net_device *dev) 2470{ 2471 struct vxlan_dev *vxlan = netdev_priv(dev); 2472 int ret; 2473 2474 ret = vxlan_sock_add(vxlan); 2475 if (ret < 0) 2476 return ret; 2477 2478 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { 2479 ret = vxlan_igmp_join(vxlan); 2480 if (ret == -EADDRINUSE) 2481 ret = 0; 2482 if (ret) { 2483 vxlan_sock_release(vxlan); 2484 return ret; 2485 } 2486 } 2487 2488 if (vxlan->cfg.age_interval) 2489 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); 2490 2491 return ret; 2492} 2493 2494/* Purge the forwarding table */ 2495static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all) 2496{ 2497 unsigned int h; 2498 2499 spin_lock_bh(&vxlan->hash_lock); 2500 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2501 struct hlist_node *p, *n; 2502 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2503 struct vxlan_fdb *f 2504 = container_of(p, struct vxlan_fdb, hlist); 2505 if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP))) 2506 continue; 2507 /* the all_zeros_mac entry is deleted at vxlan_uninit */ 2508 if (!is_zero_ether_addr(f->eth_addr)) 2509 vxlan_fdb_destroy(vxlan, f, true); 2510 } 2511 } 2512 spin_unlock_bh(&vxlan->hash_lock); 2513} 2514 2515/* Cleanup timer and forwarding table on shutdown */ 2516static int vxlan_stop(struct net_device *dev) 2517{ 2518 struct vxlan_dev *vxlan = netdev_priv(dev); 2519 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2520 int ret = 0; 2521 2522 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && 2523 !vxlan_group_used(vn, vxlan)) 2524 ret = vxlan_igmp_leave(vxlan); 2525 2526 del_timer_sync(&vxlan->age_timer); 2527 2528 vxlan_flush(vxlan, false); 2529 vxlan_sock_release(vxlan); 2530 2531 return 
ret; 2532} 2533 2534/* Stub, nothing needs to be done. */ 2535static void vxlan_set_multicast_list(struct net_device *dev) 2536{ 2537} 2538 2539static int vxlan_change_mtu(struct net_device *dev, int new_mtu) 2540{ 2541 struct vxlan_dev *vxlan = netdev_priv(dev); 2542 struct vxlan_rdst *dst = &vxlan->default_dst; 2543 struct net_device *lowerdev = __dev_get_by_index(vxlan->net, 2544 dst->remote_ifindex); 2545 bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6); 2546 2547 /* This check is different from the dev->max_mtu check, because it looks at 2548 * the lowerdev->mtu, rather than the static dev->max_mtu 2549 */ 2550 if (lowerdev) { 2551 int max_mtu = lowerdev->mtu - 2552 (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2553 if (new_mtu > max_mtu) 2554 return -EINVAL; 2555 } 2556 2557 dev->mtu = new_mtu; 2558 return 0; 2559} 2560 2561static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 2562{ 2563 struct vxlan_dev *vxlan = netdev_priv(dev); 2564 struct ip_tunnel_info *info = skb_tunnel_info(skb); 2565 __be16 sport, dport; 2566 2567 sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, 2568 vxlan->cfg.port_max, true); 2569 dport = info->key.tp_dst ? : vxlan->cfg.dst_port; 2570 2571 if (ip_tunnel_info_af(info) == AF_INET) { 2572 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); 2573 struct rtable *rt; 2574 2575 rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos, 2576 info->key.u.ipv4.dst, 2577 &info->key.u.ipv4.src, dport, sport, 2578 &info->dst_cache, info); 2579 if (IS_ERR(rt)) 2580 return PTR_ERR(rt); 2581 ip_rt_put(rt); 2582 } else { 2583#if IS_ENABLED(CONFIG_IPV6) 2584 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); 2585 struct dst_entry *ndst; 2586 2587 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos, 2588 info->key.label, &info->key.u.ipv6.dst, 2589 &info->key.u.ipv6.src, dport, sport, 2590 &info->dst_cache, info); 2591 if (IS_ERR(ndst)) 2592 return PTR_ERR(ndst); 2593 dst_release(ndst); 2594#else /* !CONFIG_IPV6 */ 2595 return -EPFNOSUPPORT; 2596#endif 2597 } 2598 info->key.tp_src = sport; 2599 info->key.tp_dst = dport; 2600 return 0; 2601} 2602 2603static const struct net_device_ops vxlan_netdev_ether_ops = { 2604 .ndo_init = vxlan_init, 2605 .ndo_uninit = vxlan_uninit, 2606 .ndo_open = vxlan_open, 2607 .ndo_stop = vxlan_stop, 2608 .ndo_start_xmit = vxlan_xmit, 2609 .ndo_get_stats64 = ip_tunnel_get_stats64, 2610 .ndo_set_rx_mode = vxlan_set_multicast_list, 2611 .ndo_change_mtu = vxlan_change_mtu, 2612 .ndo_validate_addr = eth_validate_addr, 2613 .ndo_set_mac_address = eth_mac_addr, 2614 .ndo_fdb_add = vxlan_fdb_add, 2615 .ndo_fdb_del = vxlan_fdb_delete, 2616 .ndo_fdb_dump = vxlan_fdb_dump, 2617 .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, 2618}; 2619 2620static const struct net_device_ops vxlan_netdev_raw_ops = { 2621 .ndo_init = vxlan_init, 2622 .ndo_uninit = vxlan_uninit, 2623 .ndo_open = vxlan_open, 2624 .ndo_stop = vxlan_stop, 2625 .ndo_start_xmit = vxlan_xmit, 2626 .ndo_get_stats64 = ip_tunnel_get_stats64, 2627 .ndo_change_mtu = vxlan_change_mtu, 2628 .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, 2629}; 2630 2631/* Info for udev: this is a virtual tunnel endpoint */ 2632static struct device_type vxlan_type = { 2633 .name = "vxlan", 2634}; 2635 2636/* Calls the caller's ndo_udp_tunnel_add in order to 2637 * supply the listening VXLAN UDP ports. Callers are expected 2638 * to implement ndo_udp_tunnel_add.
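 * An offloading NIC driver typically responds by programming the port
 * into its hardware parser so received VXLAN frames can be recognised.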
2639 */ 2640static void vxlan_offload_rx_ports(struct net_device *dev, bool push) 2641{ 2642 struct vxlan_sock *vs; 2643 struct net *net = dev_net(dev); 2644 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2645 unsigned int i; 2646 2647 spin_lock(&vn->sock_lock); 2648 for (i = 0; i < PORT_HASH_SIZE; ++i) { 2649 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { 2650 unsigned short type; 2651 2652 if (vs->flags & VXLAN_F_GPE) 2653 type = UDP_TUNNEL_TYPE_VXLAN_GPE; 2654 else 2655 type = UDP_TUNNEL_TYPE_VXLAN; 2656 2657 if (push) 2658 udp_tunnel_push_rx_port(dev, vs->sock, type); 2659 else 2660 udp_tunnel_drop_rx_port(dev, vs->sock, type); 2661 } 2662 } 2663 spin_unlock(&vn->sock_lock); 2664} 2665 2666/* Initialize the device structure. */ 2667static void vxlan_setup(struct net_device *dev) 2668{ 2669 struct vxlan_dev *vxlan = netdev_priv(dev); 2670 unsigned int h; 2671 2672 eth_hw_addr_random(dev); 2673 ether_setup(dev); 2674 2675 dev->needs_free_netdev = true; 2676 SET_NETDEV_DEVTYPE(dev, &vxlan_type); 2677 2678 dev->features |= NETIF_F_LLTX; 2679 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 2680 dev->features |= NETIF_F_RXCSUM; 2681 dev->features |= NETIF_F_GSO_SOFTWARE; 2682 2683 dev->vlan_features = dev->features; 2684 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 2685 dev->hw_features |= NETIF_F_GSO_SOFTWARE; 2686 netif_keep_dst(dev); 2687 dev->priv_flags |= IFF_NO_QUEUE; 2688 2689 /* MTU range: 68 - 65535 */ 2690 dev->min_mtu = ETH_MIN_MTU; 2691 dev->max_mtu = ETH_MAX_MTU; 2692 2693 INIT_LIST_HEAD(&vxlan->next); 2694 spin_lock_init(&vxlan->hash_lock); 2695 2696 timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); 2697 2698 vxlan->dev = dev; 2699 2700 gro_cells_init(&vxlan->gro_cells, dev); 2701 2702 for (h = 0; h < FDB_HASH_SIZE; ++h) 2703 INIT_HLIST_HEAD(&vxlan->fdb_head[h]); 2704} 2705 2706static void vxlan_ether_setup(struct net_device *dev) 2707{ 2708 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 2709 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 2710 dev->netdev_ops = &vxlan_netdev_ether_ops; 2711} 2712 2713static void vxlan_raw_setup(struct net_device *dev) 2714{ 2715 dev->header_ops = NULL; 2716 dev->type = ARPHRD_NONE; 2717 dev->hard_header_len = 0; 2718 dev->addr_len = 0; 2719 dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; 2720 dev->netdev_ops = &vxlan_netdev_raw_ops; 2721} 2722 2723static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { 2724 [IFLA_VXLAN_ID] = { .type = NLA_U32 }, 2725 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 2726 [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, 2727 [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, 2728 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 2729 [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, 2730 [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, 2731 [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, 2732 [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, 2733 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, 2734 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 2735 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 2736 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 2737 [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 2738 [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 2739 [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 2740 [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 2741 [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, 2742 [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, 2743 [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, 2744 [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type 
= NLA_U8 }, 2745 [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, 2746 [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, 2747 [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, 2748 [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, 2749 [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, 2750 [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, 2751 [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, 2752}; 2753 2754static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], 2755 struct netlink_ext_ack *extack) 2756{ 2757 if (tb[IFLA_ADDRESS]) { 2758 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { 2759 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], 2760 "Provided link layer address is not Ethernet"); 2761 return -EINVAL; 2762 } 2763 2764 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 2765 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], 2766 "Provided Ethernet address is not unicast"); 2767 return -EADDRNOTAVAIL; 2768 } 2769 } 2770 2771 if (tb[IFLA_MTU]) { 2772 u32 mtu = nla_get_u32(tb[IFLA_MTU]); 2773 2774 if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { 2775 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], 2776 "MTU must be between 68 and 65535"); 2777 return -EINVAL; 2778 } 2779 } 2780 2781 if (!data) { 2782 NL_SET_ERR_MSG(extack, 2783 "Required attributes not provided to perform the operation"); 2784 return -EINVAL; 2785 } 2786 2787 if (data[IFLA_VXLAN_ID]) { 2788 u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); 2789 2790 if (id >= VXLAN_N_VID) { 2791 NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID], 2792 "VXLAN ID must be lower than 16777216"); 2793 return -ERANGE; 2794 } 2795 } 2796 2797 if (data[IFLA_VXLAN_PORT_RANGE]) { 2798 const struct ifla_vxlan_port_range *p 2799 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 2800 2801 if (ntohs(p->high) < ntohs(p->low)) { 2802 NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE], 2803 "Invalid source port range"); 2804 return -EINVAL; 2805 } 2806 } 2807 2808 return 0; 2809} 2810 2811static void vxlan_get_drvinfo(struct net_device *netdev, 2812 struct ethtool_drvinfo *drvinfo) 2813{ 2814 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); 2815 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); 2816} 2817 2818static const struct ethtool_ops vxlan_ethtool_ops = { 2819 .get_drvinfo = vxlan_get_drvinfo, 2820 .get_link = ethtool_op_get_link, 2821}; 2822 2823static struct socket *vxlan_create_sock(struct net *net, bool ipv6, 2824 __be16 port, u32 flags) 2825{ 2826 struct socket *sock; 2827 struct udp_port_cfg udp_conf; 2828 int err; 2829 2830 memset(&udp_conf, 0, sizeof(udp_conf)); 2831 2832 if (ipv6) { 2833 udp_conf.family = AF_INET6; 2834 udp_conf.use_udp6_rx_checksums = 2835 !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); 2836 udp_conf.ipv6_v6only = 1; 2837 } else { 2838 udp_conf.family = AF_INET; 2839 } 2840 2841 udp_conf.local_udp_port = port; 2842 2843 /* Open UDP socket */ 2844 err = udp_sock_create(net, &udp_conf, &sock); 2845 if (err < 0) 2846 return ERR_PTR(err); 2847 2848 return sock; 2849} 2850 2851/* Create new listen socket if needed */ 2852static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, 2853 __be16 port, u32 flags) 2854{ 2855 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2856 struct vxlan_sock *vs; 2857 struct socket *sock; 2858 unsigned int h; 2859 struct udp_tunnel_sock_cfg tunnel_cfg; 2860 2861 vs = kzalloc(sizeof(*vs), GFP_KERNEL); 2862 if (!vs) 2863 return ERR_PTR(-ENOMEM); 2864 2865 for (h = 0; h < VNI_HASH_SIZE; ++h) 2866 INIT_HLIST_HEAD(&vs->vni_list[h]); 2867 2868 sock = vxlan_create_sock(net, ipv6, port, flags);
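/* vxlan_create_sock() hands back either a UDP socket bound to 'port'
 * or an ERR_PTR(); on failure the half-initialised vxlan_sock is
 * freed below before the error is propagated to the caller.
 */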
2869 if (IS_ERR(sock)) { 2870 kfree(vs); 2871 return ERR_CAST(sock); 2872 } 2873 2874 vs->sock = sock; 2875 refcount_set(&vs->refcnt, 1); 2876 vs->flags = (flags & VXLAN_F_RCV_FLAGS); 2877 2878 spin_lock(&vn->sock_lock); 2879 hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); 2880 udp_tunnel_notify_add_rx_port(sock, 2881 (vs->flags & VXLAN_F_GPE) ? 2882 UDP_TUNNEL_TYPE_VXLAN_GPE : 2883 UDP_TUNNEL_TYPE_VXLAN); 2884 spin_unlock(&vn->sock_lock); 2885 2886 /* Mark socket as an encapsulation socket. */ 2887 memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); 2888 tunnel_cfg.sk_user_data = vs; 2889 tunnel_cfg.encap_type = 1; 2890 tunnel_cfg.encap_rcv = vxlan_rcv; 2891 tunnel_cfg.encap_destroy = NULL; 2892 tunnel_cfg.gro_receive = vxlan_gro_receive; 2893 tunnel_cfg.gro_complete = vxlan_gro_complete; 2894 2895 setup_udp_tunnel_sock(net, sock, &tunnel_cfg); 2896 2897 return vs; 2898} 2899 2900static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) 2901{ 2902 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2903 struct vxlan_sock *vs = NULL; 2904 struct vxlan_dev_node *node; 2905 2906 if (!vxlan->cfg.no_share) { 2907 spin_lock(&vn->sock_lock); 2908 vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, 2909 vxlan->cfg.dst_port, vxlan->cfg.flags); 2910 if (vs && !refcount_inc_not_zero(&vs->refcnt)) { 2911 spin_unlock(&vn->sock_lock); 2912 return -EBUSY; 2913 } 2914 spin_unlock(&vn->sock_lock); 2915 } 2916 if (!vs) 2917 vs = vxlan_socket_create(vxlan->net, ipv6, 2918 vxlan->cfg.dst_port, vxlan->cfg.flags); 2919 if (IS_ERR(vs)) 2920 return PTR_ERR(vs); 2921#if IS_ENABLED(CONFIG_IPV6) 2922 if (ipv6) { 2923 rcu_assign_pointer(vxlan->vn6_sock, vs); 2924 node = &vxlan->hlist6; 2925 } else 2926#endif 2927 { 2928 rcu_assign_pointer(vxlan->vn4_sock, vs); 2929 node = &vxlan->hlist4; 2930 } 2931 vxlan_vs_add_dev(vs, vxlan, node); 2932 return 0; 2933} 2934 2935static int vxlan_sock_add(struct vxlan_dev *vxlan) 2936{ 2937 bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; 2938 bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; 2939 bool ipv4 = !ipv6 || metadata; 2940 int ret = 0; 2941 2942 RCU_INIT_POINTER(vxlan->vn4_sock, NULL); 2943#if IS_ENABLED(CONFIG_IPV6) 2944 RCU_INIT_POINTER(vxlan->vn6_sock, NULL); 2945 if (ipv6) { 2946 ret = __vxlan_sock_add(vxlan, true); 2947 if (ret < 0 && ret != -EAFNOSUPPORT) 2948 ipv4 = false; 2949 } 2950#endif 2951 if (ipv4) 2952 ret = __vxlan_sock_add(vxlan, false); 2953 if (ret < 0) 2954 vxlan_sock_release(vxlan); 2955 return ret; 2956} 2957 2958static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, 2959 struct net_device **lower, 2960 struct vxlan_dev *old, 2961 struct netlink_ext_ack *extack) 2962{ 2963 struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); 2964 struct vxlan_dev *tmp; 2965 bool use_ipv6 = false; 2966 2967 if (conf->flags & VXLAN_F_GPE) { 2968 /* For now, allow GPE only together with 2969 * COLLECT_METADATA. This can be relaxed later; in such 2970 * case, the other side of the PtP link will have to be 2971 * provided. 
2972 */ 2973 if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || 2974 !(conf->flags & VXLAN_F_COLLECT_METADATA)) { 2975 NL_SET_ERR_MSG(extack, 2976 "VXLAN GPE does not support this combination of attributes"); 2977 return -EINVAL; 2978 } 2979 } 2980 2981 if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { 2982 /* Unless IPv6 is explicitly requested, assume IPv4 */ 2983 conf->remote_ip.sa.sa_family = AF_INET; 2984 conf->saddr.sa.sa_family = AF_INET; 2985 } else if (!conf->remote_ip.sa.sa_family) { 2986 conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; 2987 } else if (!conf->saddr.sa.sa_family) { 2988 conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; 2989 } 2990 2991 if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { 2992 NL_SET_ERR_MSG(extack, 2993 "Local and remote address must be from the same family"); 2994 return -EINVAL; 2995 } 2996 2997 if (vxlan_addr_multicast(&conf->saddr)) { 2998 NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); 2999 return -EINVAL; 3000 } 3001 3002 if (conf->saddr.sa.sa_family == AF_INET6) { 3003 if (!IS_ENABLED(CONFIG_IPV6)) { 3004 NL_SET_ERR_MSG(extack, 3005 "IPv6 support not enabled in the kernel"); 3006 return -EPFNOSUPPORT; 3007 } 3008 use_ipv6 = true; 3009 conf->flags |= VXLAN_F_IPV6; 3010 3011 if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { 3012 int local_type = 3013 ipv6_addr_type(&conf->saddr.sin6.sin6_addr); 3014 int remote_type = 3015 ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); 3016 3017 if (local_type & IPV6_ADDR_LINKLOCAL) { 3018 if (!(remote_type & IPV6_ADDR_LINKLOCAL) && 3019 (remote_type != IPV6_ADDR_ANY)) { 3020 NL_SET_ERR_MSG(extack, 3021 "Invalid combination of local and remote address scopes"); 3022 return -EINVAL; 3023 } 3024 3025 conf->flags |= VXLAN_F_IPV6_LINKLOCAL; 3026 } else { 3027 if (remote_type == 3028 (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { 3029 NL_SET_ERR_MSG(extack, 3030 "Invalid combination of local and remote address scopes"); 3031 return -EINVAL; 3032 } 3033 3034 conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; 3035 } 3036 } 3037 } 3038 3039 if (conf->label && !use_ipv6) { 3040 NL_SET_ERR_MSG(extack, 3041 "Label attribute only applies to IPv6 VXLAN devices"); 3042 return -EINVAL; 3043 } 3044 3045 if (conf->remote_ifindex) { 3046 struct net_device *lowerdev; 3047 3048 lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); 3049 if (!lowerdev) { 3050 NL_SET_ERR_MSG(extack, 3051 "Invalid local interface, device not found"); 3052 return -ENODEV; 3053 } 3054 3055#if IS_ENABLED(CONFIG_IPV6) 3056 if (use_ipv6) { 3057 struct inet6_dev *idev = __in6_dev_get(lowerdev); 3058 if (idev && idev->cnf.disable_ipv6) { 3059 NL_SET_ERR_MSG(extack, 3060 "IPv6 support disabled by administrator"); 3061 return -EPERM; 3062 } 3063 } 3064#endif 3065 3066 *lower = lowerdev; 3067 } else { 3068 if (vxlan_addr_multicast(&conf->remote_ip)) { 3069 NL_SET_ERR_MSG(extack, 3070 "Local interface required for multicast remote destination"); 3071 3072 return -EINVAL; 3073 } 3074 3075#if IS_ENABLED(CONFIG_IPV6) 3076 if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { 3077 NL_SET_ERR_MSG(extack, 3078 "Local interface required for link-local local/remote addresses"); 3079 return -EINVAL; 3080 } 3081#endif 3082 3083 *lower = NULL; 3084 } 3085 3086 if (!conf->dst_port) { 3087 if (conf->flags & VXLAN_F_GPE) 3088 conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */ 3089 else 3090 conf->dst_port = htons(vxlan_port); 3091 } 3092 3093 if (!conf->age_interval) 3094 conf->age_interval = FDB_AGE_DEFAULT; 3095 3096 
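/* The loop below rejects a configuration that collides with an
 * existing device: same VNI, same destination port and same receive
 * flags/address family. Illustrative iproute2 example (hypothetical
 * interface names) in which the second command fails with EEXIST:
 *
 *   ip link add vxlan0 type vxlan id 42 dstport 4789
 *   ip link add vxlan1 type vxlan id 42 dstport 4789
 */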
list_for_each_entry(tmp, &vn->vxlan_list, next) { 3097 if (tmp == old) 3098 continue; 3099 3100 if (tmp->cfg.vni != conf->vni) 3101 continue; 3102 if (tmp->cfg.dst_port != conf->dst_port) 3103 continue; 3104 if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != 3105 (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) 3106 continue; 3107 3108 if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && 3109 tmp->cfg.remote_ifindex != conf->remote_ifindex) 3110 continue; 3111 3112 NL_SET_ERR_MSG(extack, 3113 "A VXLAN device with the specified VNI already exists"); 3114 return -EEXIST; 3115 } 3116 3117 return 0; 3118} 3119 3120static void vxlan_config_apply(struct net_device *dev, 3121 struct vxlan_config *conf, 3122 struct net_device *lowerdev, 3123 struct net *src_net, 3124 bool changelink) 3125{ 3126 struct vxlan_dev *vxlan = netdev_priv(dev); 3127 struct vxlan_rdst *dst = &vxlan->default_dst; 3128 unsigned short needed_headroom = ETH_HLEN; 3129 bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6); 3130 int max_mtu = ETH_MAX_MTU; 3131 3132 if (!changelink) { 3133 if (conf->flags & VXLAN_F_GPE) 3134 vxlan_raw_setup(dev); 3135 else 3136 vxlan_ether_setup(dev); 3137 3138 if (conf->mtu) 3139 dev->mtu = conf->mtu; 3140 3141 vxlan->net = src_net; 3142 } 3143 3144 dst->remote_vni = conf->vni; 3145 3146 memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); 3147 3148 if (lowerdev) { 3149 dst->remote_ifindex = conf->remote_ifindex; 3150 3151 dev->gso_max_size = lowerdev->gso_max_size; 3152 dev->gso_max_segs = lowerdev->gso_max_segs; 3153 3154 needed_headroom = lowerdev->hard_header_len; 3155 3156 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : 3157 VXLAN_HEADROOM); 3158 if (max_mtu < ETH_MIN_MTU) 3159 max_mtu = ETH_MIN_MTU; 3160 3161 if (!changelink && !conf->mtu) 3162 dev->mtu = max_mtu; 3163 } 3164 3165 if (dev->mtu > max_mtu) 3166 dev->mtu = max_mtu; 3167 3168 if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) 3169 needed_headroom += VXLAN6_HEADROOM; 3170 else 3171 needed_headroom += VXLAN_HEADROOM; 3172 dev->needed_headroom = needed_headroom; 3173 3174 memcpy(&vxlan->cfg, conf, sizeof(*conf)); 3175} 3176 3177static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, 3178 struct vxlan_config *conf, bool changelink, 3179 struct netlink_ext_ack *extack) 3180{ 3181 struct vxlan_dev *vxlan = netdev_priv(dev); 3182 struct net_device *lowerdev; 3183 int ret; 3184 3185 ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); 3186 if (ret) 3187 return ret; 3188 3189 vxlan_config_apply(dev, conf, lowerdev, src_net, changelink); 3190 3191 return 0; 3192} 3193 3194static int __vxlan_dev_create(struct net *net, struct net_device *dev, 3195 struct vxlan_config *conf, 3196 struct netlink_ext_ack *extack) 3197{ 3198 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 3199 struct vxlan_dev *vxlan = netdev_priv(dev); 3200 struct vxlan_fdb *f = NULL; 3201 int err; 3202 3203 err = vxlan_dev_configure(net, dev, conf, false, extack); 3204 if (err) 3205 return err; 3206 3207 dev->ethtool_ops = &vxlan_ethtool_ops; 3208 3209 /* create an fdb entry for a valid default destination */ 3210 if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { 3211 err = vxlan_fdb_create(vxlan, all_zeros_mac, 3212 &vxlan->default_dst.remote_ip, 3213 NUD_REACHABLE | NUD_PERMANENT, 3214 vxlan->cfg.dst_port, 3215 vxlan->default_dst.remote_vni, 3216 vxlan->default_dst.remote_vni, 3217 vxlan->default_dst.remote_ifindex, 3218 NTF_SELF, &f); 3219 if (err) 3220 return err; 3221 } 3222 3223 err = 
register_netdevice(dev); 3224 if (err) 3225 goto errout; 3226 3227 err = rtnl_configure_link(dev, NULL); 3228 if (err) { 3229 unregister_netdevice(dev); 3230 goto errout; 3231 } 3232 3233 /* notify default fdb entry */ 3234 if (f) 3235 vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH); 3236 3237 list_add(&vxlan->next, &vn->vxlan_list); 3238 return 0; 3239errout: 3240 if (f) 3241 vxlan_fdb_destroy(vxlan, f, false); 3242 return err; 3243} 3244 3245static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], 3246 struct net_device *dev, struct vxlan_config *conf, 3247 bool changelink) 3248{ 3249 struct vxlan_dev *vxlan = netdev_priv(dev); 3250 3251 memset(conf, 0, sizeof(*conf)); 3252 3253 /* if changelink operation, start with old existing cfg */ 3254 if (changelink) 3255 memcpy(conf, &vxlan->cfg, sizeof(*conf)); 3256 3257 if (data[IFLA_VXLAN_ID]) { 3258 __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); 3259 3260 if (changelink && (vni != conf->vni)) 3261 return -EOPNOTSUPP; 3262 conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); 3263 } 3264 3265 if (data[IFLA_VXLAN_GROUP]) { 3266 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) 3267 return -EOPNOTSUPP; 3268 3269 conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); 3270 conf->remote_ip.sa.sa_family = AF_INET; 3271 } else if (data[IFLA_VXLAN_GROUP6]) { 3272 if (!IS_ENABLED(CONFIG_IPV6)) 3273 return -EPFNOSUPPORT; 3274 3275 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) 3276 return -EOPNOTSUPP; 3277 3278 conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); 3279 conf->remote_ip.sa.sa_family = AF_INET6; 3280 } 3281 3282 if (data[IFLA_VXLAN_LOCAL]) { 3283 if (changelink && (conf->saddr.sa.sa_family != AF_INET)) 3284 return -EOPNOTSUPP; 3285 3286 conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); 3287 conf->saddr.sa.sa_family = AF_INET; 3288 } else if (data[IFLA_VXLAN_LOCAL6]) { 3289 if (!IS_ENABLED(CONFIG_IPV6)) 3290 return -EPFNOSUPPORT; 3291 3292 if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) 3293 return -EOPNOTSUPP; 3294 3295 /* TODO: respect scope id */ 3296 conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); 3297 conf->saddr.sa.sa_family = AF_INET6; 3298 } 3299 3300 if (data[IFLA_VXLAN_LINK]) 3301 conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); 3302 3303 if (data[IFLA_VXLAN_TOS]) 3304 conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); 3305 3306 if (data[IFLA_VXLAN_TTL]) 3307 conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 3308 3309 if (data[IFLA_VXLAN_TTL_INHERIT]) { 3310 if (changelink) 3311 return -EOPNOTSUPP; 3312 conf->flags |= VXLAN_F_TTL_INHERIT; 3313 } 3314 3315 if (data[IFLA_VXLAN_LABEL]) 3316 conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & 3317 IPV6_FLOWLABEL_MASK; 3318 3319 if (data[IFLA_VXLAN_LEARNING]) { 3320 if (nla_get_u8(data[IFLA_VXLAN_LEARNING])) 3321 conf->flags |= VXLAN_F_LEARN; 3322 else 3323 conf->flags &= ~VXLAN_F_LEARN; 3324 } else if (!changelink) { 3325 /* default to learn on a new device */ 3326 conf->flags |= VXLAN_F_LEARN; 3327 } 3328 3329 if (data[IFLA_VXLAN_AGEING]) { 3330 if (changelink) 3331 return -EOPNOTSUPP; 3332 conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 3333 } 3334 3335 if (data[IFLA_VXLAN_PROXY]) { 3336 if (changelink) 3337 return -EOPNOTSUPP; 3338 if (nla_get_u8(data[IFLA_VXLAN_PROXY])) 3339 conf->flags |= VXLAN_F_PROXY; 3340 } 3341 3342 if (data[IFLA_VXLAN_RSC]) { 3343 if (changelink) 3344 return -EOPNOTSUPP; 
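/* As with most flag attributes parsed in this function, RSC may only
 * be chosen when the device is created; the changelink guard above
 * rejects any attempt to flip it at runtime.
 */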
3345 if (nla_get_u8(data[IFLA_VXLAN_RSC])) 3346 conf->flags |= VXLAN_F_RSC; 3347 } 3348 3349 if (data[IFLA_VXLAN_L2MISS]) { 3350 if (changelink) 3351 return -EOPNOTSUPP; 3352 if (nla_get_u8(data[IFLA_VXLAN_L2MISS])) 3353 conf->flags |= VXLAN_F_L2MISS; 3354 } 3355 3356 if (data[IFLA_VXLAN_L3MISS]) { 3357 if (changelink) 3358 return -EOPNOTSUPP; 3359 if (nla_get_u8(data[IFLA_VXLAN_L3MISS])) 3360 conf->flags |= VXLAN_F_L3MISS; 3361 } 3362 3363 if (data[IFLA_VXLAN_LIMIT]) { 3364 if (changelink) 3365 return -EOPNOTSUPP; 3366 conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); 3367 } 3368 3369 if (data[IFLA_VXLAN_COLLECT_METADATA]) { 3370 if (changelink) 3371 return -EOPNOTSUPP; 3372 if (nla_get_u8(data[IFLA_VXLAN_COLLECT_METADATA])) 3373 conf->flags |= VXLAN_F_COLLECT_METADATA; 3374 } 3375 3376 if (data[IFLA_VXLAN_PORT_RANGE]) { 3377 if (!changelink) { 3378 const struct ifla_vxlan_port_range *p 3379 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 3380 conf->port_min = ntohs(p->low); 3381 conf->port_max = ntohs(p->high); 3382 } else { 3383 return -EOPNOTSUPP; 3384 } 3385 } 3386 3387 if (data[IFLA_VXLAN_PORT]) { 3388 if (changelink) 3389 return -EOPNOTSUPP; 3390 conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); 3391 } 3392 3393 if (data[IFLA_VXLAN_UDP_CSUM]) { 3394 if (changelink) 3395 return -EOPNOTSUPP; 3396 if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) 3397 conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; 3398 } 3399 3400 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { 3401 if (changelink) 3402 return -EOPNOTSUPP; 3403 if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX])) 3404 conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_TX; 3405 } 3406 3407 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { 3408 if (changelink) 3409 return -EOPNOTSUPP; 3410 if (nla_get_u8(data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX])) 3411 conf->flags |= VXLAN_F_UDP_ZERO_CSUM6_RX; 3412 } 3413 3414 if (data[IFLA_VXLAN_REMCSUM_TX]) { 3415 if (changelink) 3416 return -EOPNOTSUPP; 3417 if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_TX])) 3418 conf->flags |= VXLAN_F_REMCSUM_TX; 3419 } 3420 3421 if (data[IFLA_VXLAN_REMCSUM_RX]) { 3422 if (changelink) 3423 return -EOPNOTSUPP; 3424 if (nla_get_u8(data[IFLA_VXLAN_REMCSUM_RX])) 3425 conf->flags |= VXLAN_F_REMCSUM_RX; 3426 } 3427 3428 if (data[IFLA_VXLAN_GBP]) { 3429 if (changelink) 3430 return -EOPNOTSUPP; 3431 conf->flags |= VXLAN_F_GBP; 3432 } 3433 3434 if (data[IFLA_VXLAN_GPE]) { 3435 if (changelink) 3436 return -EOPNOTSUPP; 3437 conf->flags |= VXLAN_F_GPE; 3438 } 3439 3440 if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { 3441 if (changelink) 3442 return -EOPNOTSUPP; 3443 conf->flags |= VXLAN_F_REMCSUM_NOPARTIAL; 3444 } 3445 3446 if (tb[IFLA_MTU]) { 3447 if (changelink) 3448 return -EOPNOTSUPP; 3449 conf->mtu = nla_get_u32(tb[IFLA_MTU]); 3450 } 3451 3452 return 0; 3453} 3454 3455static int vxlan_newlink(struct net *src_net, struct net_device *dev, 3456 struct nlattr *tb[], struct nlattr *data[], 3457 struct netlink_ext_ack *extack) 3458{ 3459 struct vxlan_config conf; 3460 int err; 3461 3462 err = vxlan_nl2conf(tb, data, dev, &conf, false); 3463 if (err) 3464 return err; 3465 3466 return __vxlan_dev_create(src_net, dev, &conf, extack); 3467} 3468 3469static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], 3470 struct nlattr *data[], 3471 struct netlink_ext_ack *extack) 3472{ 3473 struct vxlan_dev *vxlan = netdev_priv(dev); 3474 struct vxlan_rdst *dst = &vxlan->default_dst; 3475 struct vxlan_rdst old_dst; 3476 struct vxlan_config conf; 3477 struct vxlan_fdb *f = NULL; 3478 int err; 3479 3480 err = vxlan_nl2conf(tb, data, 3481 
dev, &conf, true); 3482 if (err) 3483 return err; 3484 3485 memcpy(&old_dst, dst, sizeof(struct vxlan_rdst)); 3486 3487 err = vxlan_dev_configure(vxlan->net, dev, &conf, true, extack); 3488 if (err) 3489 return err; 3490 3491 /* handle default dst entry */ 3492 if (!vxlan_addr_equal(&dst->remote_ip, &old_dst.remote_ip)) { 3493 spin_lock_bh(&vxlan->hash_lock); 3494 if (!vxlan_addr_any(&old_dst.remote_ip)) 3495 __vxlan_fdb_delete(vxlan, all_zeros_mac, 3496 old_dst.remote_ip, 3497 vxlan->cfg.dst_port, 3498 old_dst.remote_vni, 3499 old_dst.remote_vni, 3500 old_dst.remote_ifindex, 0); 3501 3502 if (!vxlan_addr_any(&dst->remote_ip)) { 3503 err = vxlan_fdb_create(vxlan, all_zeros_mac, 3504 &dst->remote_ip, 3505 NUD_REACHABLE | NUD_PERMANENT, 3506 vxlan->cfg.dst_port, 3507 dst->remote_vni, 3508 dst->remote_vni, 3509 dst->remote_ifindex, 3510 NTF_SELF, &f); 3511 if (err) { 3512 spin_unlock_bh(&vxlan->hash_lock); 3513 return err; 3514 } 3515 vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH); 3516 } 3517 spin_unlock_bh(&vxlan->hash_lock); 3518 } 3519 3520 return 0; 3521} 3522 3523static void vxlan_dellink(struct net_device *dev, struct list_head *head) 3524{ 3525 struct vxlan_dev *vxlan = netdev_priv(dev); 3526 3527 vxlan_flush(vxlan, true); 3528 3529 gro_cells_destroy(&vxlan->gro_cells); 3530 list_del(&vxlan->next); 3531 unregister_netdevice_queue(dev, head); 3532} 3533 3534static size_t vxlan_get_size(const struct net_device *dev) 3535{ 3536 3537 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ 3538 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ 3539 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ 3540 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ 3541 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 3542 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 3543 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ 3544 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 3545 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 3546 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 3547 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 3548 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 3549 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ 3550 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 3551 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 3552 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + 3553 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ 3554 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ 3555 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ 3556 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ 3557 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ 3558 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ 3559 0; 3560} 3561 3562static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) 3563{ 3564 const struct vxlan_dev *vxlan = netdev_priv(dev); 3565 const struct vxlan_rdst *dst = &vxlan->default_dst; 3566 struct ifla_vxlan_port_range ports = { 3567 .low = htons(vxlan->cfg.port_min), 3568 .high = htons(vxlan->cfg.port_max), 3569 }; 3570 3571 if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) 3572 goto nla_put_failure; 3573 3574 if (!vxlan_addr_any(&dst->remote_ip)) { 3575 if (dst->remote_ip.sa.sa_family == AF_INET) { 3576 if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, 3577 dst->remote_ip.sin.sin_addr.s_addr)) 3578 goto nla_put_failure; 3579#if 
IS_ENABLED(CONFIG_IPV6) 3580 } else { 3581 if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, 3582 &dst->remote_ip.sin6.sin6_addr)) 3583 goto nla_put_failure; 3584#endif 3585 } 3586 } 3587 3588 if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) 3589 goto nla_put_failure; 3590 3591 if (!vxlan_addr_any(&vxlan->cfg.saddr)) { 3592 if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { 3593 if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, 3594 vxlan->cfg.saddr.sin.sin_addr.s_addr)) 3595 goto nla_put_failure; 3596#if IS_ENABLED(CONFIG_IPV6) 3597 } else { 3598 if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, 3599 &vxlan->cfg.saddr.sin6.sin6_addr)) 3600 goto nla_put_failure; 3601#endif 3602 } 3603 } 3604 3605 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || 3606 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || 3607 nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || 3608 nla_put_u8(skb, IFLA_VXLAN_LEARNING, 3609 !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || 3610 nla_put_u8(skb, IFLA_VXLAN_PROXY, 3611 !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || 3612 nla_put_u8(skb, IFLA_VXLAN_RSC, 3613 !!(vxlan->cfg.flags & VXLAN_F_RSC)) || 3614 nla_put_u8(skb, IFLA_VXLAN_L2MISS, 3615 !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || 3616 nla_put_u8(skb, IFLA_VXLAN_L3MISS, 3617 !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || 3618 nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, 3619 !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || 3620 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || 3621 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || 3622 nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || 3623 nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, 3624 !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || 3625 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 3626 !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || 3627 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 3628 !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || 3629 nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, 3630 !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || 3631 nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, 3632 !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) 3633 goto nla_put_failure; 3634 3635 if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) 3636 goto nla_put_failure; 3637 3638 if (vxlan->cfg.flags & VXLAN_F_GBP && 3639 nla_put_flag(skb, IFLA_VXLAN_GBP)) 3640 goto nla_put_failure; 3641 3642 if (vxlan->cfg.flags & VXLAN_F_GPE && 3643 nla_put_flag(skb, IFLA_VXLAN_GPE)) 3644 goto nla_put_failure; 3645 3646 if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && 3647 nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) 3648 goto nla_put_failure; 3649 3650 return 0; 3651 3652nla_put_failure: 3653 return -EMSGSIZE; 3654} 3655 3656static struct net *vxlan_get_link_net(const struct net_device *dev) 3657{ 3658 struct vxlan_dev *vxlan = netdev_priv(dev); 3659 3660 return vxlan->net; 3661} 3662 3663static struct rtnl_link_ops vxlan_link_ops __read_mostly = { 3664 .kind = "vxlan", 3665 .maxtype = IFLA_VXLAN_MAX, 3666 .policy = vxlan_policy, 3667 .priv_size = sizeof(struct vxlan_dev), 3668 .setup = vxlan_setup, 3669 .validate = vxlan_validate, 3670 .newlink = vxlan_newlink, 3671 .changelink = vxlan_changelink, 3672 .dellink = vxlan_dellink, 3673 .get_size = vxlan_get_size, 3674 .fill_info = vxlan_fill_info, 3675 .get_link_net = vxlan_get_link_net, 3676}; 3677 3678struct net_device *vxlan_dev_create(struct net *net, const char *name, 3679 u8 name_assign_type, 3680 struct vxlan_config *conf) 3681{ 3682 struct nlattr *tb[IFLA_MAX + 1]; 3683 struct net_device *dev; 
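/* In-kernel (non-netlink) entry point, used by callers such as
 * openvswitch; tb[] is zeroed below since there are no rtnl
 * attributes to hand to rtnl_create_link().
 */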
3684 int err; 3685 3686 memset(&tb, 0, sizeof(tb)); 3687 3688 dev = rtnl_create_link(net, name, name_assign_type, 3689 &vxlan_link_ops, tb); 3690 if (IS_ERR(dev)) 3691 return dev; 3692 3693 err = __vxlan_dev_create(net, dev, conf, NULL); 3694 if (err < 0) { 3695 free_netdev(dev); 3696 return ERR_PTR(err); 3697 } 3698 3699 err = rtnl_configure_link(dev, NULL); 3700 if (err < 0) { 3701 LIST_HEAD(list_kill); 3702 3703 vxlan_dellink(dev, &list_kill); 3704 unregister_netdevice_many(&list_kill); 3705 return ERR_PTR(err); 3706 } 3707 3708 return dev; 3709} 3710EXPORT_SYMBOL_GPL(vxlan_dev_create); 3711 3712static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, 3713 struct net_device *dev) 3714{ 3715 struct vxlan_dev *vxlan, *next; 3716 LIST_HEAD(list_kill); 3717 3718 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 3719 struct vxlan_rdst *dst = &vxlan->default_dst; 3720 3721 /* If we created the vxlan device with a carrier 3722 * and we lose the carrier due to module unload, 3723 * we also need to remove the vxlan device. In other 3724 * cases this is not necessary: remote_ifindex 3725 * is 0 here, so nothing matches. 3726 */ 3727 if (dst->remote_ifindex == dev->ifindex) 3728 vxlan_dellink(vxlan->dev, &list_kill); 3729 } 3730 3731 unregister_netdevice_many(&list_kill); 3732} 3733 3734static int vxlan_netdevice_event(struct notifier_block *unused, 3735 unsigned long event, void *ptr) 3736{ 3737 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 3738 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 3739 3740 if (event == NETDEV_UNREGISTER) { 3741 vxlan_offload_rx_ports(dev, false); 3742 vxlan_handle_lowerdev_unregister(vn, dev); 3743 } else if (event == NETDEV_REGISTER) { 3744 vxlan_offload_rx_ports(dev, true); 3745 } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || 3746 event == NETDEV_UDP_TUNNEL_DROP_INFO) { 3747 vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); 3748 } 3749 3750 return NOTIFY_DONE; 3751} 3752 3753static struct notifier_block vxlan_notifier_block __read_mostly = { 3754 .notifier_call = vxlan_netdevice_event, 3755}; 3756 3757static __net_init int vxlan_init_net(struct net *net) 3758{ 3759 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 3760 unsigned int h; 3761 3762 INIT_LIST_HEAD(&vn->vxlan_list); 3763 spin_lock_init(&vn->sock_lock); 3764 3765 for (h = 0; h < PORT_HASH_SIZE; ++h) 3766 INIT_HLIST_HEAD(&vn->sock_list[h]); 3767 3768 return 0; 3769} 3770 3771static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) 3772{ 3773 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 3774 struct vxlan_dev *vxlan, *next; 3775 struct net_device *dev, *aux; 3776 unsigned int h; 3777 3778 for_each_netdev_safe(net, dev, aux) 3779 if (dev->rtnl_link_ops == &vxlan_link_ops) 3780 unregister_netdevice_queue(dev, head); 3781 3782 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 3783 /* If vxlan->dev is in the same netns, it has already been added 3784 * to the list by the previous loop.
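 * A vxlan device whose net_device lives in another namespace still
 * references this one via vxlan->net and must be queued for
 * unregistration here as well.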
3785 */ 3786 if (!net_eq(dev_net(vxlan->dev), net)) { 3787 gro_cells_destroy(&vxlan->gro_cells); 3788 unregister_netdevice_queue(vxlan->dev, head); 3789 } 3790 } 3791 3792 for (h = 0; h < PORT_HASH_SIZE; ++h) 3793 WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); 3794} 3795 3796static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) 3797{ 3798 struct net *net; 3799 LIST_HEAD(list); 3800 3801 rtnl_lock(); 3802 list_for_each_entry(net, net_list, exit_list) 3803 vxlan_destroy_tunnels(net, &list); 3804 3805 unregister_netdevice_many(&list); 3806 rtnl_unlock(); 3807} 3808 3809static struct pernet_operations vxlan_net_ops = { 3810 .init = vxlan_init_net, 3811 .exit_batch = vxlan_exit_batch_net, 3812 .id = &vxlan_net_id, 3813 .size = sizeof(struct vxlan_net), 3814}; 3815 3816static int __init vxlan_init_module(void) 3817{ 3818 int rc; 3819 3820 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); 3821 3822 rc = register_pernet_subsys(&vxlan_net_ops); 3823 if (rc) 3824 goto out1; 3825 3826 rc = register_netdevice_notifier(&vxlan_notifier_block); 3827 if (rc) 3828 goto out2; 3829 3830 rc = rtnl_link_register(&vxlan_link_ops); 3831 if (rc) 3832 goto out3; 3833 3834 return 0; 3835out3: 3836 unregister_netdevice_notifier(&vxlan_notifier_block); 3837out2: 3838 unregister_pernet_subsys(&vxlan_net_ops); 3839out1: 3840 return rc; 3841} 3842late_initcall(vxlan_init_module); 3843 3844static void __exit vxlan_cleanup_module(void) 3845{ 3846 rtnl_link_unregister(&vxlan_link_ops); 3847 unregister_netdevice_notifier(&vxlan_notifier_block); 3848 unregister_pernet_subsys(&vxlan_net_ops); 3849 /* rcu_barrier() is called by netns */ 3850} 3851module_exit(vxlan_cleanup_module); 3852 3853MODULE_LICENSE("GPL"); 3854MODULE_VERSION(VXLAN_VERSION); 3855MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); 3856MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); 3857MODULE_ALIAS_RTNL_LINK("vxlan");
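A minimal sketch (editorial addition, not part of the driver) of how another kernel module might call the exported vxlan_dev_create() above to obtain a metadata-mode VXLAN device; the function name, the chosen port and the use of "vxlan%d"/NET_NAME_ENUM are illustrative assumptions:

static struct net_device *example_vxlan_create(struct net *net)
{
	struct vxlan_config conf;

	memset(&conf, 0, sizeof(conf));
	/* external control: per-packet ip_tunnel_info picks VNI and remote */
	conf.flags = VXLAN_F_COLLECT_METADATA;
	conf.dst_port = htons(4789);	/* IANA-assigned VXLAN port */

	/* "vxlan%d" lets the core pick a free name at register time */
	return vxlan_dev_create(net, "vxlan%d", NET_NAME_ENUM, &conf);
}

On failure vxlan_dev_create() returns an ERR_PTR() rather than NULL, so callers check the result with IS_ERR().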