Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v5.1-rc2 4416 lines 114 kB view raw
1/* 2 * VXLAN: Virtual eXtensible Local Area Network 3 * 4 * Copyright (c) 2012-2013 Vyatta Inc. 5 * 6 * This program is free software; you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License version 2 as 8 * published by the Free Software Foundation. 9 */ 10 11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12 13#include <linux/kernel.h> 14#include <linux/module.h> 15#include <linux/errno.h> 16#include <linux/slab.h> 17#include <linux/udp.h> 18#include <linux/igmp.h> 19#include <linux/if_ether.h> 20#include <linux/ethtool.h> 21#include <net/arp.h> 22#include <net/ndisc.h> 23#include <net/ip.h> 24#include <net/icmp.h> 25#include <net/rtnetlink.h> 26#include <net/inet_ecn.h> 27#include <net/net_namespace.h> 28#include <net/netns/generic.h> 29#include <net/tun_proto.h> 30#include <net/vxlan.h> 31 32#if IS_ENABLED(CONFIG_IPV6) 33#include <net/ip6_tunnel.h> 34#include <net/ip6_checksum.h> 35#endif 36 37#define VXLAN_VERSION "0.1" 38 39#define PORT_HASH_BITS 8 40#define PORT_HASH_SIZE (1<<PORT_HASH_BITS) 41#define FDB_AGE_DEFAULT 300 /* 5 min */ 42#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */ 43 44/* UDP port for VXLAN traffic. 45 * The IANA assigned port is 4789, but the Linux default is 8472 46 * for compatibility with early adopters. 47 */ 48static unsigned short vxlan_port __read_mostly = 8472; 49module_param_named(udp_port, vxlan_port, ushort, 0444); 50MODULE_PARM_DESC(udp_port, "Destination UDP port"); 51 52static bool log_ecn_error = true; 53module_param(log_ecn_error, bool, 0644); 54MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); 55 56static unsigned int vxlan_net_id; 57static struct rtnl_link_ops vxlan_link_ops; 58 59static const u8 all_zeros_mac[ETH_ALEN + 2]; 60 61static int vxlan_sock_add(struct vxlan_dev *vxlan); 62 63static void vxlan_vs_del_dev(struct vxlan_dev *vxlan); 64 65/* per-network namespace private data for this module */ 66struct vxlan_net { 67 struct list_head vxlan_list; 68 struct hlist_head sock_list[PORT_HASH_SIZE]; 69 spinlock_t sock_lock; 70}; 71 72/* Forwarding table entry */ 73struct vxlan_fdb { 74 struct hlist_node hlist; /* linked list of entries */ 75 struct rcu_head rcu; 76 unsigned long updated; /* jiffies */ 77 unsigned long used; 78 struct list_head remotes; 79 u8 eth_addr[ETH_ALEN]; 80 u16 state; /* see ndm_state */ 81 __be32 vni; 82 u16 flags; /* see ndm_flags and below */ 83}; 84 85#define NTF_VXLAN_ADDED_BY_USER 0x100 86 87/* salt for hash table */ 88static u32 vxlan_salt __read_mostly; 89 90static inline bool vxlan_collect_metadata(struct vxlan_sock *vs) 91{ 92 return vs->flags & VXLAN_F_COLLECT_METADATA || 93 ip_tunnel_collect_metadata(); 94} 95 96#if IS_ENABLED(CONFIG_IPV6) 97static inline 98bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 99{ 100 if (a->sa.sa_family != b->sa.sa_family) 101 return false; 102 if (a->sa.sa_family == AF_INET6) 103 return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr); 104 else 105 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 106} 107 108static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 109{ 110 if (nla_len(nla) >= sizeof(struct in6_addr)) { 111 ip->sin6.sin6_addr = nla_get_in6_addr(nla); 112 ip->sa.sa_family = AF_INET6; 113 return 0; 114 } else if (nla_len(nla) >= sizeof(__be32)) { 115 ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); 116 ip->sa.sa_family = AF_INET; 117 return 0; 118 } else { 119 return -EAFNOSUPPORT; 120 } 121} 122 123static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 124 const union vxlan_addr *ip) 125{ 126 if (ip->sa.sa_family == AF_INET6) 127 return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr); 128 else 129 return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); 130} 131 132#else /* !CONFIG_IPV6 */ 133 134static inline 135bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b) 136{ 137 return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr; 138} 139 140static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla) 141{ 142 if (nla_len(nla) >= sizeof(struct in6_addr)) { 143 return -EAFNOSUPPORT; 144 } else if (nla_len(nla) >= sizeof(__be32)) { 145 ip->sin.sin_addr.s_addr = nla_get_in_addr(nla); 146 ip->sa.sa_family = AF_INET; 147 return 0; 148 } else { 149 return -EAFNOSUPPORT; 150 } 151} 152 153static int vxlan_nla_put_addr(struct sk_buff *skb, int attr, 154 const union vxlan_addr *ip) 155{ 156 return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr); 157} 158#endif 159 160/* Virtual Network hash table head */ 161static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni) 162{ 163 return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)]; 164} 165 166/* Socket hash table head */ 167static inline struct hlist_head *vs_head(struct net *net, __be16 port) 168{ 169 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 170 171 return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; 172} 173 174/* First remote destination for a forwarding entry. 175 * Guaranteed to be non-NULL because remotes are never deleted. 176 */ 177static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) 178{ 179 return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); 180} 181 182static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) 183{ 184 return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); 185} 186 187/* Find VXLAN socket based on network namespace, address family and UDP port 188 * and enabled unshareable flags. 189 */ 190static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family, 191 __be16 port, u32 flags, int ifindex) 192{ 193 struct vxlan_sock *vs; 194 195 flags &= VXLAN_F_RCV_FLAGS; 196 197 hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) { 198 if (inet_sk(vs->sock->sk)->inet_sport == port && 199 vxlan_get_sk_family(vs) == family && 200 vs->flags == flags && 201 vs->sock->sk->sk_bound_dev_if == ifindex) 202 return vs; 203 } 204 return NULL; 205} 206 207static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex, 208 __be32 vni) 209{ 210 struct vxlan_dev_node *node; 211 212 /* For flow based devices, map all packets to VNI 0 */ 213 if (vs->flags & VXLAN_F_COLLECT_METADATA) 214 vni = 0; 215 216 hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) { 217 if (node->vxlan->default_dst.remote_vni != vni) 218 continue; 219 220 if (IS_ENABLED(CONFIG_IPV6)) { 221 const struct vxlan_config *cfg = &node->vxlan->cfg; 222 223 if ((cfg->flags & VXLAN_F_IPV6_LINKLOCAL) && 224 cfg->remote_ifindex != ifindex) 225 continue; 226 } 227 228 return node->vxlan; 229 } 230 231 return NULL; 232} 233 234/* Look up VNI in a per net namespace table */ 235static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex, 236 __be32 vni, sa_family_t family, 237 __be16 port, u32 flags) 238{ 239 struct vxlan_sock *vs; 240 241 vs = vxlan_find_sock(net, family, port, flags, ifindex); 242 if (!vs) 243 return NULL; 244 245 return vxlan_vs_find_vni(vs, ifindex, vni); 246} 247 248/* Fill in neighbour message in skbuff. */ 249static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, 250 const struct vxlan_fdb *fdb, 251 u32 portid, u32 seq, int type, unsigned int flags, 252 const struct vxlan_rdst *rdst) 253{ 254 unsigned long now = jiffies; 255 struct nda_cacheinfo ci; 256 struct nlmsghdr *nlh; 257 struct ndmsg *ndm; 258 bool send_ip, send_eth; 259 260 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); 261 if (nlh == NULL) 262 return -EMSGSIZE; 263 264 ndm = nlmsg_data(nlh); 265 memset(ndm, 0, sizeof(*ndm)); 266 267 send_eth = send_ip = true; 268 269 if (type == RTM_GETNEIGH) { 270 send_ip = !vxlan_addr_any(&rdst->remote_ip); 271 send_eth = !is_zero_ether_addr(fdb->eth_addr); 272 ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; 273 } else 274 ndm->ndm_family = AF_BRIDGE; 275 ndm->ndm_state = fdb->state; 276 ndm->ndm_ifindex = vxlan->dev->ifindex; 277 ndm->ndm_flags = fdb->flags; 278 if (rdst->offloaded) 279 ndm->ndm_flags |= NTF_OFFLOADED; 280 ndm->ndm_type = RTN_UNICAST; 281 282 if (!net_eq(dev_net(vxlan->dev), vxlan->net) && 283 nla_put_s32(skb, NDA_LINK_NETNSID, 284 peernet2id(dev_net(vxlan->dev), vxlan->net))) 285 goto nla_put_failure; 286 287 if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) 288 goto nla_put_failure; 289 290 if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) 291 goto nla_put_failure; 292 293 if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && 294 nla_put_be16(skb, NDA_PORT, rdst->remote_port)) 295 goto nla_put_failure; 296 if (rdst->remote_vni != vxlan->default_dst.remote_vni && 297 nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) 298 goto nla_put_failure; 299 if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && 300 nla_put_u32(skb, NDA_SRC_VNI, 301 be32_to_cpu(fdb->vni))) 302 goto nla_put_failure; 303 if (rdst->remote_ifindex && 304 nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) 305 goto nla_put_failure; 306 307 ci.ndm_used = jiffies_to_clock_t(now - fdb->used); 308 ci.ndm_confirmed = 0; 309 ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); 310 ci.ndm_refcnt = 0; 311 312 if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) 313 goto nla_put_failure; 314 315 nlmsg_end(skb, nlh); 316 return 0; 317 318nla_put_failure: 319 nlmsg_cancel(skb, nlh); 320 return -EMSGSIZE; 321} 322 323static inline size_t vxlan_nlmsg_size(void) 324{ 325 return NLMSG_ALIGN(sizeof(struct ndmsg)) 326 + nla_total_size(ETH_ALEN) /* NDA_LLADDR */ 327 + nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */ 328 + nla_total_size(sizeof(__be16)) /* NDA_PORT */ 329 + nla_total_size(sizeof(__be32)) /* NDA_VNI */ 330 + nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */ 331 + nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */ 332 + nla_total_size(sizeof(struct nda_cacheinfo)); 333} 334 335static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, 336 struct vxlan_rdst *rd, int type) 337{ 338 struct net *net = dev_net(vxlan->dev); 339 struct sk_buff *skb; 340 int err = -ENOBUFS; 341 342 skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC); 343 if (skb == NULL) 344 goto errout; 345 346 err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd); 347 if (err < 0) { 348 /* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */ 349 WARN_ON(err == -EMSGSIZE); 350 kfree_skb(skb); 351 goto errout; 352 } 353 354 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); 355 return; 356errout: 357 if (err < 0) 358 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 359} 360 361static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan, 362 const struct vxlan_fdb *fdb, 363 const struct vxlan_rdst *rd, 364 struct netlink_ext_ack *extack, 365 struct switchdev_notifier_vxlan_fdb_info *fdb_info) 366{ 367 fdb_info->info.dev = vxlan->dev; 368 fdb_info->info.extack = extack; 369 fdb_info->remote_ip = rd->remote_ip; 370 fdb_info->remote_port = rd->remote_port; 371 fdb_info->remote_vni = rd->remote_vni; 372 fdb_info->remote_ifindex = rd->remote_ifindex; 373 memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN); 374 fdb_info->vni = fdb->vni; 375 fdb_info->offloaded = rd->offloaded; 376 fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER; 377} 378 379static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan, 380 struct vxlan_fdb *fdb, 381 struct vxlan_rdst *rd, 382 bool adding, 383 struct netlink_ext_ack *extack) 384{ 385 struct switchdev_notifier_vxlan_fdb_info info; 386 enum switchdev_notifier_type notifier_type; 387 int ret; 388 389 if (WARN_ON(!rd)) 390 return 0; 391 392 notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE 393 : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE; 394 vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info); 395 ret = call_switchdev_notifiers(notifier_type, vxlan->dev, 396 &info.info, extack); 397 return notifier_to_errno(ret); 398} 399 400static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, 401 struct vxlan_rdst *rd, int type, bool swdev_notify, 402 struct netlink_ext_ack *extack) 403{ 404 int err; 405 406 if (swdev_notify) { 407 switch (type) { 408 case RTM_NEWNEIGH: 409 err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, 410 true, extack); 411 if (err) 412 return err; 413 break; 414 case RTM_DELNEIGH: 415 vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, 416 false, extack); 417 break; 418 } 419 } 420 421 __vxlan_fdb_notify(vxlan, fdb, rd, type); 422 return 0; 423} 424 425static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa) 426{ 427 struct vxlan_dev *vxlan = netdev_priv(dev); 428 struct vxlan_fdb f = { 429 .state = NUD_STALE, 430 }; 431 struct vxlan_rdst remote = { 432 .remote_ip = *ipa, /* goes to NDA_DST */ 433 .remote_vni = cpu_to_be32(VXLAN_N_VID), 434 }; 435 436 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); 437} 438 439static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN]) 440{ 441 struct vxlan_fdb f = { 442 .state = NUD_STALE, 443 }; 444 struct vxlan_rdst remote = { }; 445 446 memcpy(f.eth_addr, eth_addr, ETH_ALEN); 447 448 vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL); 449} 450 451/* Hash Ethernet address */ 452static u32 eth_hash(const unsigned char *addr) 453{ 454 u64 value = get_unaligned((u64 *)addr); 455 456 /* only want 6 bytes */ 457#ifdef __BIG_ENDIAN 458 value >>= 16; 459#else 460 value <<= 16; 461#endif 462 return hash_64(value, FDB_HASH_BITS); 463} 464 465static u32 eth_vni_hash(const unsigned char *addr, __be32 vni) 466{ 467 /* use 1 byte of OUI and 3 bytes of NIC */ 468 u32 key = get_unaligned((u32 *)(addr + 2)); 469 470 return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1); 471} 472 473/* Hash chain to use given mac address */ 474static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan, 475 const u8 *mac, __be32 vni) 476{ 477 if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) 478 return &vxlan->fdb_head[eth_vni_hash(mac, vni)]; 479 else 480 return &vxlan->fdb_head[eth_hash(mac)]; 481} 482 483/* Look up Ethernet address in forwarding table */ 484static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan, 485 const u8 *mac, __be32 vni) 486{ 487 struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni); 488 struct vxlan_fdb *f; 489 490 hlist_for_each_entry_rcu(f, head, hlist) { 491 if (ether_addr_equal(mac, f->eth_addr)) { 492 if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { 493 if (vni == f->vni) 494 return f; 495 } else { 496 return f; 497 } 498 } 499 } 500 501 return NULL; 502} 503 504static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, 505 const u8 *mac, __be32 vni) 506{ 507 struct vxlan_fdb *f; 508 509 f = __vxlan_find_mac(vxlan, mac, vni); 510 if (f && f->used != jiffies) 511 f->used = jiffies; 512 513 return f; 514} 515 516/* caller should hold vxlan->hash_lock */ 517static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f, 518 union vxlan_addr *ip, __be16 port, 519 __be32 vni, __u32 ifindex) 520{ 521 struct vxlan_rdst *rd; 522 523 list_for_each_entry(rd, &f->remotes, list) { 524 if (vxlan_addr_equal(&rd->remote_ip, ip) && 525 rd->remote_port == port && 526 rd->remote_vni == vni && 527 rd->remote_ifindex == ifindex) 528 return rd; 529 } 530 531 return NULL; 532} 533 534int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni, 535 struct switchdev_notifier_vxlan_fdb_info *fdb_info) 536{ 537 struct vxlan_dev *vxlan = netdev_priv(dev); 538 u8 eth_addr[ETH_ALEN + 2] = { 0 }; 539 struct vxlan_rdst *rdst; 540 struct vxlan_fdb *f; 541 int rc = 0; 542 543 if (is_multicast_ether_addr(mac) || 544 is_zero_ether_addr(mac)) 545 return -EINVAL; 546 547 ether_addr_copy(eth_addr, mac); 548 549 rcu_read_lock(); 550 551 f = __vxlan_find_mac(vxlan, eth_addr, vni); 552 if (!f) { 553 rc = -ENOENT; 554 goto out; 555 } 556 557 rdst = first_remote_rcu(f); 558 vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info); 559 560out: 561 rcu_read_unlock(); 562 return rc; 563} 564EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc); 565 566static int vxlan_fdb_notify_one(struct notifier_block *nb, 567 const struct vxlan_dev *vxlan, 568 const struct vxlan_fdb *f, 569 const struct vxlan_rdst *rdst, 570 struct netlink_ext_ack *extack) 571{ 572 struct switchdev_notifier_vxlan_fdb_info fdb_info; 573 int rc; 574 575 vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info); 576 rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE, 577 &fdb_info); 578 return notifier_to_errno(rc); 579} 580 581int vxlan_fdb_replay(const struct net_device *dev, __be32 vni, 582 struct notifier_block *nb, 583 struct netlink_ext_ack *extack) 584{ 585 struct vxlan_dev *vxlan; 586 struct vxlan_rdst *rdst; 587 struct vxlan_fdb *f; 588 unsigned int h; 589 int rc = 0; 590 591 if (!netif_is_vxlan(dev)) 592 return -EINVAL; 593 vxlan = netdev_priv(dev); 594 595 spin_lock_bh(&vxlan->hash_lock); 596 for (h = 0; h < FDB_HASH_SIZE; ++h) { 597 hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) { 598 if (f->vni == vni) { 599 list_for_each_entry(rdst, &f->remotes, list) { 600 rc = vxlan_fdb_notify_one(nb, vxlan, 601 f, rdst, 602 extack); 603 if (rc) 604 goto out; 605 } 606 } 607 } 608 } 609 610out: 611 spin_unlock_bh(&vxlan->hash_lock); 612 return rc; 613} 614EXPORT_SYMBOL_GPL(vxlan_fdb_replay); 615 616void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni) 617{ 618 struct vxlan_dev *vxlan; 619 struct vxlan_rdst *rdst; 620 struct vxlan_fdb *f; 621 unsigned int h; 622 623 if (!netif_is_vxlan(dev)) 624 return; 625 vxlan = netdev_priv(dev); 626 627 spin_lock_bh(&vxlan->hash_lock); 628 for (h = 0; h < FDB_HASH_SIZE; ++h) { 629 hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) 630 if (f->vni == vni) 631 list_for_each_entry(rdst, &f->remotes, list) 632 rdst->offloaded = false; 633 } 634 spin_unlock_bh(&vxlan->hash_lock); 635} 636EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload); 637 638/* Replace destination of unicast mac */ 639static int vxlan_fdb_replace(struct vxlan_fdb *f, 640 union vxlan_addr *ip, __be16 port, __be32 vni, 641 __u32 ifindex, struct vxlan_rdst *oldrd) 642{ 643 struct vxlan_rdst *rd; 644 645 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 646 if (rd) 647 return 0; 648 649 rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list); 650 if (!rd) 651 return 0; 652 653 *oldrd = *rd; 654 dst_cache_reset(&rd->dst_cache); 655 rd->remote_ip = *ip; 656 rd->remote_port = port; 657 rd->remote_vni = vni; 658 rd->remote_ifindex = ifindex; 659 rd->offloaded = false; 660 return 1; 661} 662 663/* Add/update destinations for multicast */ 664static int vxlan_fdb_append(struct vxlan_fdb *f, 665 union vxlan_addr *ip, __be16 port, __be32 vni, 666 __u32 ifindex, struct vxlan_rdst **rdp) 667{ 668 struct vxlan_rdst *rd; 669 670 rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex); 671 if (rd) 672 return 0; 673 674 rd = kmalloc(sizeof(*rd), GFP_ATOMIC); 675 if (rd == NULL) 676 return -ENOBUFS; 677 678 if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) { 679 kfree(rd); 680 return -ENOBUFS; 681 } 682 683 rd->remote_ip = *ip; 684 rd->remote_port = port; 685 rd->offloaded = false; 686 rd->remote_vni = vni; 687 rd->remote_ifindex = ifindex; 688 689 list_add_tail_rcu(&rd->list, &f->remotes); 690 691 *rdp = rd; 692 return 1; 693} 694 695static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb, 696 unsigned int off, 697 struct vxlanhdr *vh, size_t hdrlen, 698 __be32 vni_field, 699 struct gro_remcsum *grc, 700 bool nopartial) 701{ 702 size_t start, offset; 703 704 if (skb->remcsum_offload) 705 return vh; 706 707 if (!NAPI_GRO_CB(skb)->csum_valid) 708 return NULL; 709 710 start = vxlan_rco_start(vni_field); 711 offset = start + vxlan_rco_offset(vni_field); 712 713 vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen, 714 start, offset, grc, nopartial); 715 716 skb->remcsum_offload = 1; 717 718 return vh; 719} 720 721static struct sk_buff *vxlan_gro_receive(struct sock *sk, 722 struct list_head *head, 723 struct sk_buff *skb) 724{ 725 struct sk_buff *pp = NULL; 726 struct sk_buff *p; 727 struct vxlanhdr *vh, *vh2; 728 unsigned int hlen, off_vx; 729 int flush = 1; 730 struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk); 731 __be32 flags; 732 struct gro_remcsum grc; 733 734 skb_gro_remcsum_init(&grc); 735 736 off_vx = skb_gro_offset(skb); 737 hlen = off_vx + sizeof(*vh); 738 vh = skb_gro_header_fast(skb, off_vx); 739 if (skb_gro_header_hard(skb, hlen)) { 740 vh = skb_gro_header_slow(skb, hlen, off_vx); 741 if (unlikely(!vh)) 742 goto out; 743 } 744 745 skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr)); 746 747 flags = vh->vx_flags; 748 749 if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) { 750 vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr), 751 vh->vx_vni, &grc, 752 !!(vs->flags & 753 VXLAN_F_REMCSUM_NOPARTIAL)); 754 755 if (!vh) 756 goto out; 757 } 758 759 skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */ 760 761 list_for_each_entry(p, head, list) { 762 if (!NAPI_GRO_CB(p)->same_flow) 763 continue; 764 765 vh2 = (struct vxlanhdr *)(p->data + off_vx); 766 if (vh->vx_flags != vh2->vx_flags || 767 vh->vx_vni != vh2->vx_vni) { 768 NAPI_GRO_CB(p)->same_flow = 0; 769 continue; 770 } 771 } 772 773 pp = call_gro_receive(eth_gro_receive, head, skb); 774 flush = 0; 775 776out: 777 skb_gro_flush_final_remcsum(skb, pp, flush, &grc); 778 779 return pp; 780} 781 782static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) 783{ 784 /* Sets 'skb->inner_mac_header' since we are always called with 785 * 'skb->encapsulation' set. 786 */ 787 return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); 788} 789 790static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, 791 const u8 *mac, __u16 state, 792 __be32 src_vni, __u16 ndm_flags) 793{ 794 struct vxlan_fdb *f; 795 796 f = kmalloc(sizeof(*f), GFP_ATOMIC); 797 if (!f) 798 return NULL; 799 f->state = state; 800 f->flags = ndm_flags; 801 f->updated = f->used = jiffies; 802 f->vni = src_vni; 803 INIT_LIST_HEAD(&f->remotes); 804 memcpy(f->eth_addr, mac, ETH_ALEN); 805 806 return f; 807} 808 809static int vxlan_fdb_create(struct vxlan_dev *vxlan, 810 const u8 *mac, union vxlan_addr *ip, 811 __u16 state, __be16 port, __be32 src_vni, 812 __be32 vni, __u32 ifindex, __u16 ndm_flags, 813 struct vxlan_fdb **fdb) 814{ 815 struct vxlan_rdst *rd = NULL; 816 struct vxlan_fdb *f; 817 int rc; 818 819 if (vxlan->cfg.addrmax && 820 vxlan->addrcnt >= vxlan->cfg.addrmax) 821 return -ENOSPC; 822 823 netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); 824 f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); 825 if (!f) 826 return -ENOMEM; 827 828 rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); 829 if (rc < 0) { 830 kfree(f); 831 return rc; 832 } 833 834 ++vxlan->addrcnt; 835 hlist_add_head_rcu(&f->hlist, 836 vxlan_fdb_head(vxlan, mac, src_vni)); 837 838 *fdb = f; 839 840 return 0; 841} 842 843static void vxlan_fdb_free(struct rcu_head *head) 844{ 845 struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu); 846 struct vxlan_rdst *rd, *nd; 847 848 list_for_each_entry_safe(rd, nd, &f->remotes, list) { 849 dst_cache_destroy(&rd->dst_cache); 850 kfree(rd); 851 } 852 kfree(f); 853} 854 855static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, 856 bool do_notify, bool swdev_notify) 857{ 858 struct vxlan_rdst *rd; 859 860 netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); 861 862 --vxlan->addrcnt; 863 if (do_notify) 864 list_for_each_entry(rd, &f->remotes, list) 865 vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, 866 swdev_notify, NULL); 867 868 hlist_del_rcu(&f->hlist); 869 call_rcu(&f->rcu, vxlan_fdb_free); 870} 871 872static void vxlan_dst_free(struct rcu_head *head) 873{ 874 struct vxlan_rdst *rd = container_of(head, struct vxlan_rdst, rcu); 875 876 dst_cache_destroy(&rd->dst_cache); 877 kfree(rd); 878} 879 880static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, 881 union vxlan_addr *ip, 882 __u16 state, __u16 flags, 883 __be16 port, __be32 vni, 884 __u32 ifindex, __u16 ndm_flags, 885 struct vxlan_fdb *f, 886 bool swdev_notify, 887 struct netlink_ext_ack *extack) 888{ 889 __u16 fdb_flags = (ndm_flags & ~NTF_USE); 890 struct vxlan_rdst *rd = NULL; 891 struct vxlan_rdst oldrd; 892 int notify = 0; 893 int rc = 0; 894 int err; 895 896 /* Do not allow an externally learned entry to take over an entry added 897 * by the user. 898 */ 899 if (!(fdb_flags & NTF_EXT_LEARNED) || 900 !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { 901 if (f->state != state) { 902 f->state = state; 903 f->updated = jiffies; 904 notify = 1; 905 } 906 if (f->flags != fdb_flags) { 907 f->flags = fdb_flags; 908 f->updated = jiffies; 909 notify = 1; 910 } 911 } 912 913 if ((flags & NLM_F_REPLACE)) { 914 /* Only change unicasts */ 915 if (!(is_multicast_ether_addr(f->eth_addr) || 916 is_zero_ether_addr(f->eth_addr))) { 917 rc = vxlan_fdb_replace(f, ip, port, vni, 918 ifindex, &oldrd); 919 notify |= rc; 920 } else { 921 return -EOPNOTSUPP; 922 } 923 } 924 if ((flags & NLM_F_APPEND) && 925 (is_multicast_ether_addr(f->eth_addr) || 926 is_zero_ether_addr(f->eth_addr))) { 927 rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); 928 929 if (rc < 0) 930 return rc; 931 notify |= rc; 932 } 933 934 if (ndm_flags & NTF_USE) 935 f->used = jiffies; 936 937 if (notify) { 938 if (rd == NULL) 939 rd = first_remote_rtnl(f); 940 941 err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, 942 swdev_notify, extack); 943 if (err) 944 goto err_notify; 945 } 946 947 return 0; 948 949err_notify: 950 if ((flags & NLM_F_REPLACE) && rc) 951 *rd = oldrd; 952 else if ((flags & NLM_F_APPEND) && rc) { 953 list_del_rcu(&rd->list); 954 call_rcu(&rd->rcu, vxlan_dst_free); 955 } 956 return err; 957} 958 959static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, 960 const u8 *mac, union vxlan_addr *ip, 961 __u16 state, __u16 flags, 962 __be16 port, __be32 src_vni, __be32 vni, 963 __u32 ifindex, __u16 ndm_flags, 964 bool swdev_notify, 965 struct netlink_ext_ack *extack) 966{ 967 __u16 fdb_flags = (ndm_flags & ~NTF_USE); 968 struct vxlan_fdb *f; 969 int rc; 970 971 /* Disallow replace to add a multicast entry */ 972 if ((flags & NLM_F_REPLACE) && 973 (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) 974 return -EOPNOTSUPP; 975 976 netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); 977 rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, 978 vni, ifindex, fdb_flags, &f); 979 if (rc < 0) 980 return rc; 981 982 rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH, 983 swdev_notify, extack); 984 if (rc) 985 goto err_notify; 986 987 return 0; 988 989err_notify: 990 vxlan_fdb_destroy(vxlan, f, false, false); 991 return rc; 992} 993 994/* Add new entry to forwarding table -- assumes lock held */ 995static int vxlan_fdb_update(struct vxlan_dev *vxlan, 996 const u8 *mac, union vxlan_addr *ip, 997 __u16 state, __u16 flags, 998 __be16 port, __be32 src_vni, __be32 vni, 999 __u32 ifindex, __u16 ndm_flags, 1000 bool swdev_notify, 1001 struct netlink_ext_ack *extack) 1002{ 1003 struct vxlan_fdb *f; 1004 1005 f = __vxlan_find_mac(vxlan, mac, src_vni); 1006 if (f) { 1007 if (flags & NLM_F_EXCL) { 1008 netdev_dbg(vxlan->dev, 1009 "lost race to create %pM\n", mac); 1010 return -EEXIST; 1011 } 1012 1013 return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, 1014 vni, ifindex, ndm_flags, f, 1015 swdev_notify, extack); 1016 } else { 1017 if (!(flags & NLM_F_CREATE)) 1018 return -ENOENT; 1019 1020 return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, 1021 port, src_vni, vni, ifindex, 1022 ndm_flags, swdev_notify, extack); 1023 } 1024} 1025 1026static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, 1027 struct vxlan_rdst *rd, bool swdev_notify) 1028{ 1029 list_del_rcu(&rd->list); 1030 vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL); 1031 call_rcu(&rd->rcu, vxlan_dst_free); 1032} 1033 1034static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, 1035 union vxlan_addr *ip, __be16 *port, __be32 *src_vni, 1036 __be32 *vni, u32 *ifindex) 1037{ 1038 struct net *net = dev_net(vxlan->dev); 1039 int err; 1040 1041 if (tb[NDA_DST]) { 1042 err = vxlan_nla_get_addr(ip, tb[NDA_DST]); 1043 if (err) 1044 return err; 1045 } else { 1046 union vxlan_addr *remote = &vxlan->default_dst.remote_ip; 1047 if (remote->sa.sa_family == AF_INET) { 1048 ip->sin.sin_addr.s_addr = htonl(INADDR_ANY); 1049 ip->sa.sa_family = AF_INET; 1050#if IS_ENABLED(CONFIG_IPV6) 1051 } else { 1052 ip->sin6.sin6_addr = in6addr_any; 1053 ip->sa.sa_family = AF_INET6; 1054#endif 1055 } 1056 } 1057 1058 if (tb[NDA_PORT]) { 1059 if (nla_len(tb[NDA_PORT]) != sizeof(__be16)) 1060 return -EINVAL; 1061 *port = nla_get_be16(tb[NDA_PORT]); 1062 } else { 1063 *port = vxlan->cfg.dst_port; 1064 } 1065 1066 if (tb[NDA_VNI]) { 1067 if (nla_len(tb[NDA_VNI]) != sizeof(u32)) 1068 return -EINVAL; 1069 *vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); 1070 } else { 1071 *vni = vxlan->default_dst.remote_vni; 1072 } 1073 1074 if (tb[NDA_SRC_VNI]) { 1075 if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32)) 1076 return -EINVAL; 1077 *src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI])); 1078 } else { 1079 *src_vni = vxlan->default_dst.remote_vni; 1080 } 1081 1082 if (tb[NDA_IFINDEX]) { 1083 struct net_device *tdev; 1084 1085 if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32)) 1086 return -EINVAL; 1087 *ifindex = nla_get_u32(tb[NDA_IFINDEX]); 1088 tdev = __dev_get_by_index(net, *ifindex); 1089 if (!tdev) 1090 return -EADDRNOTAVAIL; 1091 } else { 1092 *ifindex = 0; 1093 } 1094 1095 return 0; 1096} 1097 1098/* Add static entry (via netlink) */ 1099static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], 1100 struct net_device *dev, 1101 const unsigned char *addr, u16 vid, u16 flags, 1102 struct netlink_ext_ack *extack) 1103{ 1104 struct vxlan_dev *vxlan = netdev_priv(dev); 1105 /* struct net *net = dev_net(vxlan->dev); */ 1106 union vxlan_addr ip; 1107 __be16 port; 1108 __be32 src_vni, vni; 1109 u32 ifindex; 1110 int err; 1111 1112 if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) { 1113 pr_info("RTM_NEWNEIGH with invalid state %#x\n", 1114 ndm->ndm_state); 1115 return -EINVAL; 1116 } 1117 1118 if (tb[NDA_DST] == NULL) 1119 return -EINVAL; 1120 1121 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); 1122 if (err) 1123 return err; 1124 1125 if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) 1126 return -EAFNOSUPPORT; 1127 1128 spin_lock_bh(&vxlan->hash_lock); 1129 err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, 1130 port, src_vni, vni, ifindex, 1131 ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, 1132 true, extack); 1133 spin_unlock_bh(&vxlan->hash_lock); 1134 1135 return err; 1136} 1137 1138static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, 1139 const unsigned char *addr, union vxlan_addr ip, 1140 __be16 port, __be32 src_vni, __be32 vni, 1141 u32 ifindex, bool swdev_notify) 1142{ 1143 struct vxlan_fdb *f; 1144 struct vxlan_rdst *rd = NULL; 1145 int err = -ENOENT; 1146 1147 f = vxlan_find_mac(vxlan, addr, src_vni); 1148 if (!f) 1149 return err; 1150 1151 if (!vxlan_addr_any(&ip)) { 1152 rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex); 1153 if (!rd) 1154 goto out; 1155 } 1156 1157 /* remove a destination if it's not the only one on the list, 1158 * otherwise destroy the fdb entry 1159 */ 1160 if (rd && !list_is_singular(&f->remotes)) { 1161 vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify); 1162 goto out; 1163 } 1164 1165 vxlan_fdb_destroy(vxlan, f, true, swdev_notify); 1166 1167out: 1168 return 0; 1169} 1170 1171/* Delete entry (via netlink) */ 1172static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], 1173 struct net_device *dev, 1174 const unsigned char *addr, u16 vid) 1175{ 1176 struct vxlan_dev *vxlan = netdev_priv(dev); 1177 union vxlan_addr ip; 1178 __be32 src_vni, vni; 1179 __be16 port; 1180 u32 ifindex; 1181 int err; 1182 1183 err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); 1184 if (err) 1185 return err; 1186 1187 spin_lock_bh(&vxlan->hash_lock); 1188 err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex, 1189 true); 1190 spin_unlock_bh(&vxlan->hash_lock); 1191 1192 return err; 1193} 1194 1195/* Dump forwarding table */ 1196static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, 1197 struct net_device *dev, 1198 struct net_device *filter_dev, int *idx) 1199{ 1200 struct vxlan_dev *vxlan = netdev_priv(dev); 1201 unsigned int h; 1202 int err = 0; 1203 1204 for (h = 0; h < FDB_HASH_SIZE; ++h) { 1205 struct vxlan_fdb *f; 1206 1207 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { 1208 struct vxlan_rdst *rd; 1209 1210 list_for_each_entry_rcu(rd, &f->remotes, list) { 1211 if (*idx < cb->args[2]) 1212 goto skip; 1213 1214 err = vxlan_fdb_info(skb, vxlan, f, 1215 NETLINK_CB(cb->skb).portid, 1216 cb->nlh->nlmsg_seq, 1217 RTM_NEWNEIGH, 1218 NLM_F_MULTI, rd); 1219 if (err < 0) 1220 goto out; 1221skip: 1222 *idx += 1; 1223 } 1224 } 1225 } 1226out: 1227 return err; 1228} 1229 1230static int vxlan_fdb_get(struct sk_buff *skb, 1231 struct nlattr *tb[], 1232 struct net_device *dev, 1233 const unsigned char *addr, 1234 u16 vid, u32 portid, u32 seq, 1235 struct netlink_ext_ack *extack) 1236{ 1237 struct vxlan_dev *vxlan = netdev_priv(dev); 1238 struct vxlan_fdb *f; 1239 __be32 vni; 1240 int err; 1241 1242 if (tb[NDA_VNI]) 1243 vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI])); 1244 else 1245 vni = vxlan->default_dst.remote_vni; 1246 1247 rcu_read_lock(); 1248 1249 f = __vxlan_find_mac(vxlan, addr, vni); 1250 if (!f) { 1251 NL_SET_ERR_MSG(extack, "Fdb entry not found"); 1252 err = -ENOENT; 1253 goto errout; 1254 } 1255 1256 err = vxlan_fdb_info(skb, vxlan, f, portid, seq, 1257 RTM_NEWNEIGH, 0, first_remote_rcu(f)); 1258errout: 1259 rcu_read_unlock(); 1260 return err; 1261} 1262 1263/* Watch incoming packets to learn mapping between Ethernet address 1264 * and Tunnel endpoint. 1265 * Return true if packet is bogus and should be dropped. 1266 */ 1267static bool vxlan_snoop(struct net_device *dev, 1268 union vxlan_addr *src_ip, const u8 *src_mac, 1269 u32 src_ifindex, __be32 vni) 1270{ 1271 struct vxlan_dev *vxlan = netdev_priv(dev); 1272 struct vxlan_fdb *f; 1273 u32 ifindex = 0; 1274 1275#if IS_ENABLED(CONFIG_IPV6) 1276 if (src_ip->sa.sa_family == AF_INET6 && 1277 (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL)) 1278 ifindex = src_ifindex; 1279#endif 1280 1281 f = vxlan_find_mac(vxlan, src_mac, vni); 1282 if (likely(f)) { 1283 struct vxlan_rdst *rdst = first_remote_rcu(f); 1284 1285 if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) && 1286 rdst->remote_ifindex == ifindex)) 1287 return false; 1288 1289 /* Don't migrate static entries, drop packets */ 1290 if (f->state & (NUD_PERMANENT | NUD_NOARP)) 1291 return true; 1292 1293 if (net_ratelimit()) 1294 netdev_info(dev, 1295 "%pM migrated from %pIS to %pIS\n", 1296 src_mac, &rdst->remote_ip.sa, &src_ip->sa); 1297 1298 rdst->remote_ip = *src_ip; 1299 f->updated = jiffies; 1300 vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); 1301 } else { 1302 /* learned new entry */ 1303 spin_lock(&vxlan->hash_lock); 1304 1305 /* close off race between vxlan_flush and incoming packets */ 1306 if (netif_running(dev)) 1307 vxlan_fdb_update(vxlan, src_mac, src_ip, 1308 NUD_REACHABLE, 1309 NLM_F_EXCL|NLM_F_CREATE, 1310 vxlan->cfg.dst_port, 1311 vni, 1312 vxlan->default_dst.remote_vni, 1313 ifindex, NTF_SELF, true, NULL); 1314 spin_unlock(&vxlan->hash_lock); 1315 } 1316 1317 return false; 1318} 1319 1320/* See if multicast group is already in use by other ID */ 1321static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) 1322{ 1323 struct vxlan_dev *vxlan; 1324 struct vxlan_sock *sock4; 1325#if IS_ENABLED(CONFIG_IPV6) 1326 struct vxlan_sock *sock6; 1327#endif 1328 unsigned short family = dev->default_dst.remote_ip.sa.sa_family; 1329 1330 sock4 = rtnl_dereference(dev->vn4_sock); 1331 1332 /* The vxlan_sock is only used by dev, leaving group has 1333 * no effect on other vxlan devices. 1334 */ 1335 if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1) 1336 return false; 1337#if IS_ENABLED(CONFIG_IPV6) 1338 sock6 = rtnl_dereference(dev->vn6_sock); 1339 if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1) 1340 return false; 1341#endif 1342 1343 list_for_each_entry(vxlan, &vn->vxlan_list, next) { 1344 if (!netif_running(vxlan->dev) || vxlan == dev) 1345 continue; 1346 1347 if (family == AF_INET && 1348 rtnl_dereference(vxlan->vn4_sock) != sock4) 1349 continue; 1350#if IS_ENABLED(CONFIG_IPV6) 1351 if (family == AF_INET6 && 1352 rtnl_dereference(vxlan->vn6_sock) != sock6) 1353 continue; 1354#endif 1355 1356 if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip, 1357 &dev->default_dst.remote_ip)) 1358 continue; 1359 1360 if (vxlan->default_dst.remote_ifindex != 1361 dev->default_dst.remote_ifindex) 1362 continue; 1363 1364 return true; 1365 } 1366 1367 return false; 1368} 1369 1370static bool __vxlan_sock_release_prep(struct vxlan_sock *vs) 1371{ 1372 struct vxlan_net *vn; 1373 1374 if (!vs) 1375 return false; 1376 if (!refcount_dec_and_test(&vs->refcnt)) 1377 return false; 1378 1379 vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id); 1380 spin_lock(&vn->sock_lock); 1381 hlist_del_rcu(&vs->hlist); 1382 udp_tunnel_notify_del_rx_port(vs->sock, 1383 (vs->flags & VXLAN_F_GPE) ? 1384 UDP_TUNNEL_TYPE_VXLAN_GPE : 1385 UDP_TUNNEL_TYPE_VXLAN); 1386 spin_unlock(&vn->sock_lock); 1387 1388 return true; 1389} 1390 1391static void vxlan_sock_release(struct vxlan_dev *vxlan) 1392{ 1393 struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); 1394#if IS_ENABLED(CONFIG_IPV6) 1395 struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); 1396 1397 RCU_INIT_POINTER(vxlan->vn6_sock, NULL); 1398#endif 1399 1400 RCU_INIT_POINTER(vxlan->vn4_sock, NULL); 1401 synchronize_net(); 1402 1403 vxlan_vs_del_dev(vxlan); 1404 1405 if (__vxlan_sock_release_prep(sock4)) { 1406 udp_tunnel_sock_release(sock4->sock); 1407 kfree(sock4); 1408 } 1409 1410#if IS_ENABLED(CONFIG_IPV6) 1411 if (__vxlan_sock_release_prep(sock6)) { 1412 udp_tunnel_sock_release(sock6->sock); 1413 kfree(sock6); 1414 } 1415#endif 1416} 1417 1418/* Update multicast group membership when first VNI on 1419 * multicast address is brought up 1420 */ 1421static int vxlan_igmp_join(struct vxlan_dev *vxlan) 1422{ 1423 struct sock *sk; 1424 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1425 int ifindex = vxlan->default_dst.remote_ifindex; 1426 int ret = -EINVAL; 1427 1428 if (ip->sa.sa_family == AF_INET) { 1429 struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); 1430 struct ip_mreqn mreq = { 1431 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1432 .imr_ifindex = ifindex, 1433 }; 1434 1435 sk = sock4->sock->sk; 1436 lock_sock(sk); 1437 ret = ip_mc_join_group(sk, &mreq); 1438 release_sock(sk); 1439#if IS_ENABLED(CONFIG_IPV6) 1440 } else { 1441 struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); 1442 1443 sk = sock6->sock->sk; 1444 lock_sock(sk); 1445 ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex, 1446 &ip->sin6.sin6_addr); 1447 release_sock(sk); 1448#endif 1449 } 1450 1451 return ret; 1452} 1453 1454/* Inverse of vxlan_igmp_join when last VNI is brought down */ 1455static int vxlan_igmp_leave(struct vxlan_dev *vxlan) 1456{ 1457 struct sock *sk; 1458 union vxlan_addr *ip = &vxlan->default_dst.remote_ip; 1459 int ifindex = vxlan->default_dst.remote_ifindex; 1460 int ret = -EINVAL; 1461 1462 if (ip->sa.sa_family == AF_INET) { 1463 struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock); 1464 struct ip_mreqn mreq = { 1465 .imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr, 1466 .imr_ifindex = ifindex, 1467 }; 1468 1469 sk = sock4->sock->sk; 1470 lock_sock(sk); 1471 ret = ip_mc_leave_group(sk, &mreq); 1472 release_sock(sk); 1473#if IS_ENABLED(CONFIG_IPV6) 1474 } else { 1475 struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock); 1476 1477 sk = sock6->sock->sk; 1478 lock_sock(sk); 1479 ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex, 1480 &ip->sin6.sin6_addr); 1481 release_sock(sk); 1482#endif 1483 } 1484 1485 return ret; 1486} 1487 1488static bool vxlan_remcsum(struct vxlanhdr *unparsed, 1489 struct sk_buff *skb, u32 vxflags) 1490{ 1491 size_t start, offset; 1492 1493 if (!(unparsed->vx_flags & VXLAN_HF_RCO) || skb->remcsum_offload) 1494 goto out; 1495 1496 start = vxlan_rco_start(unparsed->vx_vni); 1497 offset = start + vxlan_rco_offset(unparsed->vx_vni); 1498 1499 if (!pskb_may_pull(skb, offset + sizeof(u16))) 1500 return false; 1501 1502 skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start, offset, 1503 !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL)); 1504out: 1505 unparsed->vx_flags &= ~VXLAN_HF_RCO; 1506 unparsed->vx_vni &= VXLAN_VNI_MASK; 1507 return true; 1508} 1509 1510static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed, 1511 struct sk_buff *skb, u32 vxflags, 1512 struct vxlan_metadata *md) 1513{ 1514 struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed; 1515 struct metadata_dst *tun_dst; 1516 1517 if (!(unparsed->vx_flags & VXLAN_HF_GBP)) 1518 goto out; 1519 1520 md->gbp = ntohs(gbp->policy_id); 1521 1522 tun_dst = (struct metadata_dst *)skb_dst(skb); 1523 if (tun_dst) { 1524 tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT; 1525 tun_dst->u.tun_info.options_len = sizeof(*md); 1526 } 1527 if (gbp->dont_learn) 1528 md->gbp |= VXLAN_GBP_DONT_LEARN; 1529 1530 if (gbp->policy_applied) 1531 md->gbp |= VXLAN_GBP_POLICY_APPLIED; 1532 1533 /* In flow-based mode, GBP is carried in dst_metadata */ 1534 if (!(vxflags & VXLAN_F_COLLECT_METADATA)) 1535 skb->mark = md->gbp; 1536out: 1537 unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS; 1538} 1539 1540static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed, 1541 __be16 *protocol, 1542 struct sk_buff *skb, u32 vxflags) 1543{ 1544 struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed; 1545 1546 /* Need to have Next Protocol set for interfaces in GPE mode. */ 1547 if (!gpe->np_applied) 1548 return false; 1549 /* "The initial version is 0. If a receiver does not support the 1550 * version indicated it MUST drop the packet. 1551 */ 1552 if (gpe->version != 0) 1553 return false; 1554 /* "When the O bit is set to 1, the packet is an OAM packet and OAM 1555 * processing MUST occur." However, we don't implement OAM 1556 * processing, thus drop the packet. 1557 */ 1558 if (gpe->oam_flag) 1559 return false; 1560 1561 *protocol = tun_p_to_eth_p(gpe->next_protocol); 1562 if (!*protocol) 1563 return false; 1564 1565 unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS; 1566 return true; 1567} 1568 1569static bool vxlan_set_mac(struct vxlan_dev *vxlan, 1570 struct vxlan_sock *vs, 1571 struct sk_buff *skb, __be32 vni) 1572{ 1573 union vxlan_addr saddr; 1574 u32 ifindex = skb->dev->ifindex; 1575 1576 skb_reset_mac_header(skb); 1577 skb->protocol = eth_type_trans(skb, vxlan->dev); 1578 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); 1579 1580 /* Ignore packet loops (and multicast echo) */ 1581 if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr)) 1582 return false; 1583 1584 /* Get address from the outer IP header */ 1585 if (vxlan_get_sk_family(vs) == AF_INET) { 1586 saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr; 1587 saddr.sa.sa_family = AF_INET; 1588#if IS_ENABLED(CONFIG_IPV6) 1589 } else { 1590 saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr; 1591 saddr.sa.sa_family = AF_INET6; 1592#endif 1593 } 1594 1595 if ((vxlan->cfg.flags & VXLAN_F_LEARN) && 1596 vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source, ifindex, vni)) 1597 return false; 1598 1599 return true; 1600} 1601 1602static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph, 1603 struct sk_buff *skb) 1604{ 1605 int err = 0; 1606 1607 if (vxlan_get_sk_family(vs) == AF_INET) 1608 err = IP_ECN_decapsulate(oiph, skb); 1609#if IS_ENABLED(CONFIG_IPV6) 1610 else 1611 err = IP6_ECN_decapsulate(oiph, skb); 1612#endif 1613 1614 if (unlikely(err) && log_ecn_error) { 1615 if (vxlan_get_sk_family(vs) == AF_INET) 1616 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", 1617 &((struct iphdr *)oiph)->saddr, 1618 ((struct iphdr *)oiph)->tos); 1619 else 1620 net_info_ratelimited("non-ECT from %pI6\n", 1621 &((struct ipv6hdr *)oiph)->saddr); 1622 } 1623 return err <= 1; 1624} 1625 1626/* Callback from net/ipv4/udp.c to receive packets */ 1627static int vxlan_rcv(struct sock *sk, struct sk_buff *skb) 1628{ 1629 struct pcpu_sw_netstats *stats; 1630 struct vxlan_dev *vxlan; 1631 struct vxlan_sock *vs; 1632 struct vxlanhdr unparsed; 1633 struct vxlan_metadata _md; 1634 struct vxlan_metadata *md = &_md; 1635 __be16 protocol = htons(ETH_P_TEB); 1636 bool raw_proto = false; 1637 void *oiph; 1638 __be32 vni = 0; 1639 1640 /* Need UDP and VXLAN header to be present */ 1641 if (!pskb_may_pull(skb, VXLAN_HLEN)) 1642 goto drop; 1643 1644 unparsed = *vxlan_hdr(skb); 1645 /* VNI flag always required to be set */ 1646 if (!(unparsed.vx_flags & VXLAN_HF_VNI)) { 1647 netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n", 1648 ntohl(vxlan_hdr(skb)->vx_flags), 1649 ntohl(vxlan_hdr(skb)->vx_vni)); 1650 /* Return non vxlan pkt */ 1651 goto drop; 1652 } 1653 unparsed.vx_flags &= ~VXLAN_HF_VNI; 1654 unparsed.vx_vni &= ~VXLAN_VNI_MASK; 1655 1656 vs = rcu_dereference_sk_user_data(sk); 1657 if (!vs) 1658 goto drop; 1659 1660 vni = vxlan_vni(vxlan_hdr(skb)->vx_vni); 1661 1662 vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni); 1663 if (!vxlan) 1664 goto drop; 1665 1666 /* For backwards compatibility, only allow reserved fields to be 1667 * used by VXLAN extensions if explicitly requested. 1668 */ 1669 if (vs->flags & VXLAN_F_GPE) { 1670 if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags)) 1671 goto drop; 1672 raw_proto = true; 1673 } 1674 1675 if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto, 1676 !net_eq(vxlan->net, dev_net(vxlan->dev)))) 1677 goto drop; 1678 1679 if (vxlan_collect_metadata(vs)) { 1680 struct metadata_dst *tun_dst; 1681 1682 tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY, 1683 key32_to_tunnel_id(vni), sizeof(*md)); 1684 1685 if (!tun_dst) 1686 goto drop; 1687 1688 md = ip_tunnel_info_opts(&tun_dst->u.tun_info); 1689 1690 skb_dst_set(skb, (struct dst_entry *)tun_dst); 1691 } else { 1692 memset(md, 0, sizeof(*md)); 1693 } 1694 1695 if (vs->flags & VXLAN_F_REMCSUM_RX) 1696 if (!vxlan_remcsum(&unparsed, skb, vs->flags)) 1697 goto drop; 1698 if (vs->flags & VXLAN_F_GBP) 1699 vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md); 1700 /* Note that GBP and GPE can never be active together. This is 1701 * ensured in vxlan_dev_configure. 1702 */ 1703 1704 if (unparsed.vx_flags || unparsed.vx_vni) { 1705 /* If there are any unprocessed flags remaining treat 1706 * this as a malformed packet. This behavior diverges from 1707 * VXLAN RFC (RFC7348) which stipulates that bits in reserved 1708 * in reserved fields are to be ignored. The approach here 1709 * maintains compatibility with previous stack code, and also 1710 * is more robust and provides a little more security in 1711 * adding extensions to VXLAN. 1712 */ 1713 goto drop; 1714 } 1715 1716 if (!raw_proto) { 1717 if (!vxlan_set_mac(vxlan, vs, skb, vni)) 1718 goto drop; 1719 } else { 1720 skb_reset_mac_header(skb); 1721 skb->dev = vxlan->dev; 1722 skb->pkt_type = PACKET_HOST; 1723 } 1724 1725 oiph = skb_network_header(skb); 1726 skb_reset_network_header(skb); 1727 1728 if (!vxlan_ecn_decapsulate(vs, oiph, skb)) { 1729 ++vxlan->dev->stats.rx_frame_errors; 1730 ++vxlan->dev->stats.rx_errors; 1731 goto drop; 1732 } 1733 1734 rcu_read_lock(); 1735 1736 if (unlikely(!(vxlan->dev->flags & IFF_UP))) { 1737 rcu_read_unlock(); 1738 atomic_long_inc(&vxlan->dev->rx_dropped); 1739 goto drop; 1740 } 1741 1742 stats = this_cpu_ptr(vxlan->dev->tstats); 1743 u64_stats_update_begin(&stats->syncp); 1744 stats->rx_packets++; 1745 stats->rx_bytes += skb->len; 1746 u64_stats_update_end(&stats->syncp); 1747 1748 gro_cells_receive(&vxlan->gro_cells, skb); 1749 1750 rcu_read_unlock(); 1751 1752 return 0; 1753 1754drop: 1755 /* Consume bad packet */ 1756 kfree_skb(skb); 1757 return 0; 1758} 1759 1760/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */ 1761static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb) 1762{ 1763 struct vxlan_dev *vxlan; 1764 struct vxlan_sock *vs; 1765 struct vxlanhdr *hdr; 1766 __be32 vni; 1767 1768 if (skb->len < VXLAN_HLEN) 1769 return -EINVAL; 1770 1771 hdr = vxlan_hdr(skb); 1772 1773 if (!(hdr->vx_flags & VXLAN_HF_VNI)) 1774 return -EINVAL; 1775 1776 vs = rcu_dereference_sk_user_data(sk); 1777 if (!vs) 1778 return -ENOENT; 1779 1780 vni = vxlan_vni(hdr->vx_vni); 1781 vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni); 1782 if (!vxlan) 1783 return -ENOENT; 1784 1785 return 0; 1786} 1787 1788static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) 1789{ 1790 struct vxlan_dev *vxlan = netdev_priv(dev); 1791 struct arphdr *parp; 1792 u8 *arpptr, *sha; 1793 __be32 sip, tip; 1794 struct neighbour *n; 1795 1796 if (dev->flags & IFF_NOARP) 1797 goto out; 1798 1799 if (!pskb_may_pull(skb, arp_hdr_len(dev))) { 1800 dev->stats.tx_dropped++; 1801 goto out; 1802 } 1803 parp = arp_hdr(skb); 1804 1805 if ((parp->ar_hrd != htons(ARPHRD_ETHER) && 1806 parp->ar_hrd != htons(ARPHRD_IEEE802)) || 1807 parp->ar_pro != htons(ETH_P_IP) || 1808 parp->ar_op != htons(ARPOP_REQUEST) || 1809 parp->ar_hln != dev->addr_len || 1810 parp->ar_pln != 4) 1811 goto out; 1812 arpptr = (u8 *)parp + sizeof(struct arphdr); 1813 sha = arpptr; 1814 arpptr += dev->addr_len; /* sha */ 1815 memcpy(&sip, arpptr, sizeof(sip)); 1816 arpptr += sizeof(sip); 1817 arpptr += dev->addr_len; /* tha */ 1818 memcpy(&tip, arpptr, sizeof(tip)); 1819 1820 if (ipv4_is_loopback(tip) || 1821 ipv4_is_multicast(tip)) 1822 goto out; 1823 1824 n = neigh_lookup(&arp_tbl, &tip, dev); 1825 1826 if (n) { 1827 struct vxlan_fdb *f; 1828 struct sk_buff *reply; 1829 1830 if (!(n->nud_state & NUD_CONNECTED)) { 1831 neigh_release(n); 1832 goto out; 1833 } 1834 1835 f = vxlan_find_mac(vxlan, n->ha, vni); 1836 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1837 /* bridge-local neighbor */ 1838 neigh_release(n); 1839 goto out; 1840 } 1841 1842 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, 1843 n->ha, sha); 1844 1845 neigh_release(n); 1846 1847 if (reply == NULL) 1848 goto out; 1849 1850 skb_reset_mac_header(reply); 1851 __skb_pull(reply, skb_network_offset(reply)); 1852 reply->ip_summed = CHECKSUM_UNNECESSARY; 1853 reply->pkt_type = PACKET_HOST; 1854 1855 if (netif_rx_ni(reply) == NET_RX_DROP) 1856 dev->stats.rx_dropped++; 1857 } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { 1858 union vxlan_addr ipa = { 1859 .sin.sin_addr.s_addr = tip, 1860 .sin.sin_family = AF_INET, 1861 }; 1862 1863 vxlan_ip_miss(dev, &ipa); 1864 } 1865out: 1866 consume_skb(skb); 1867 return NETDEV_TX_OK; 1868} 1869 1870#if IS_ENABLED(CONFIG_IPV6) 1871static struct sk_buff *vxlan_na_create(struct sk_buff *request, 1872 struct neighbour *n, bool isrouter) 1873{ 1874 struct net_device *dev = request->dev; 1875 struct sk_buff *reply; 1876 struct nd_msg *ns, *na; 1877 struct ipv6hdr *pip6; 1878 u8 *daddr; 1879 int na_olen = 8; /* opt hdr + ETH_ALEN for target */ 1880 int ns_olen; 1881 int i, len; 1882 1883 if (dev == NULL || !pskb_may_pull(request, request->len)) 1884 return NULL; 1885 1886 len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) + 1887 sizeof(*na) + na_olen + dev->needed_tailroom; 1888 reply = alloc_skb(len, GFP_ATOMIC); 1889 if (reply == NULL) 1890 return NULL; 1891 1892 reply->protocol = htons(ETH_P_IPV6); 1893 reply->dev = dev; 1894 skb_reserve(reply, LL_RESERVED_SPACE(request->dev)); 1895 skb_push(reply, sizeof(struct ethhdr)); 1896 skb_reset_mac_header(reply); 1897 1898 ns = (struct nd_msg *)(ipv6_hdr(request) + 1); 1899 1900 daddr = eth_hdr(request)->h_source; 1901 ns_olen = request->len - skb_network_offset(request) - 1902 sizeof(struct ipv6hdr) - sizeof(*ns); 1903 for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) { 1904 if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) { 1905 daddr = ns->opt + i + sizeof(struct nd_opt_hdr); 1906 break; 1907 } 1908 } 1909 1910 /* Ethernet header */ 1911 ether_addr_copy(eth_hdr(reply)->h_dest, daddr); 1912 ether_addr_copy(eth_hdr(reply)->h_source, n->ha); 1913 eth_hdr(reply)->h_proto = htons(ETH_P_IPV6); 1914 reply->protocol = htons(ETH_P_IPV6); 1915 1916 skb_pull(reply, sizeof(struct ethhdr)); 1917 skb_reset_network_header(reply); 1918 skb_put(reply, sizeof(struct ipv6hdr)); 1919 1920 /* IPv6 header */ 1921 1922 pip6 = ipv6_hdr(reply); 1923 memset(pip6, 0, sizeof(struct ipv6hdr)); 1924 pip6->version = 6; 1925 pip6->priority = ipv6_hdr(request)->priority; 1926 pip6->nexthdr = IPPROTO_ICMPV6; 1927 pip6->hop_limit = 255; 1928 pip6->daddr = ipv6_hdr(request)->saddr; 1929 pip6->saddr = *(struct in6_addr *)n->primary_key; 1930 1931 skb_pull(reply, sizeof(struct ipv6hdr)); 1932 skb_reset_transport_header(reply); 1933 1934 /* Neighbor Advertisement */ 1935 na = skb_put_zero(reply, sizeof(*na) + na_olen); 1936 na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; 1937 na->icmph.icmp6_router = isrouter; 1938 na->icmph.icmp6_override = 1; 1939 na->icmph.icmp6_solicited = 1; 1940 na->target = ns->target; 1941 ether_addr_copy(&na->opt[2], n->ha); 1942 na->opt[0] = ND_OPT_TARGET_LL_ADDR; 1943 na->opt[1] = na_olen >> 3; 1944 1945 na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr, 1946 &pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6, 1947 csum_partial(na, sizeof(*na)+na_olen, 0)); 1948 1949 pip6->payload_len = htons(sizeof(*na)+na_olen); 1950 1951 skb_push(reply, sizeof(struct ipv6hdr)); 1952 1953 reply->ip_summed = CHECKSUM_UNNECESSARY; 1954 1955 return reply; 1956} 1957 1958static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni) 1959{ 1960 struct vxlan_dev *vxlan = netdev_priv(dev); 1961 const struct in6_addr *daddr; 1962 const struct ipv6hdr *iphdr; 1963 struct inet6_dev *in6_dev; 1964 struct neighbour *n; 1965 struct nd_msg *msg; 1966 1967 in6_dev = __in6_dev_get(dev); 1968 if (!in6_dev) 1969 goto out; 1970 1971 iphdr = ipv6_hdr(skb); 1972 daddr = &iphdr->daddr; 1973 msg = (struct nd_msg *)(iphdr + 1); 1974 1975 if (ipv6_addr_loopback(daddr) || 1976 ipv6_addr_is_multicast(&msg->target)) 1977 goto out; 1978 1979 n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev); 1980 1981 if (n) { 1982 struct vxlan_fdb *f; 1983 struct sk_buff *reply; 1984 1985 if (!(n->nud_state & NUD_CONNECTED)) { 1986 neigh_release(n); 1987 goto out; 1988 } 1989 1990 f = vxlan_find_mac(vxlan, n->ha, vni); 1991 if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) { 1992 /* bridge-local neighbor */ 1993 neigh_release(n); 1994 goto out; 1995 } 1996 1997 reply = vxlan_na_create(skb, n, 1998 !!(f ? f->flags & NTF_ROUTER : 0)); 1999 2000 neigh_release(n); 2001 2002 if (reply == NULL) 2003 goto out; 2004 2005 if (netif_rx_ni(reply) == NET_RX_DROP) 2006 dev->stats.rx_dropped++; 2007 2008 } else if (vxlan->cfg.flags & VXLAN_F_L3MISS) { 2009 union vxlan_addr ipa = { 2010 .sin6.sin6_addr = msg->target, 2011 .sin6.sin6_family = AF_INET6, 2012 }; 2013 2014 vxlan_ip_miss(dev, &ipa); 2015 } 2016 2017out: 2018 consume_skb(skb); 2019 return NETDEV_TX_OK; 2020} 2021#endif 2022 2023static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb) 2024{ 2025 struct vxlan_dev *vxlan = netdev_priv(dev); 2026 struct neighbour *n; 2027 2028 if (is_multicast_ether_addr(eth_hdr(skb)->h_dest)) 2029 return false; 2030 2031 n = NULL; 2032 switch (ntohs(eth_hdr(skb)->h_proto)) { 2033 case ETH_P_IP: 2034 { 2035 struct iphdr *pip; 2036 2037 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 2038 return false; 2039 pip = ip_hdr(skb); 2040 n = neigh_lookup(&arp_tbl, &pip->daddr, dev); 2041 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { 2042 union vxlan_addr ipa = { 2043 .sin.sin_addr.s_addr = pip->daddr, 2044 .sin.sin_family = AF_INET, 2045 }; 2046 2047 vxlan_ip_miss(dev, &ipa); 2048 return false; 2049 } 2050 2051 break; 2052 } 2053#if IS_ENABLED(CONFIG_IPV6) 2054 case ETH_P_IPV6: 2055 { 2056 struct ipv6hdr *pip6; 2057 2058 if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) 2059 return false; 2060 pip6 = ipv6_hdr(skb); 2061 n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev); 2062 if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) { 2063 union vxlan_addr ipa = { 2064 .sin6.sin6_addr = pip6->daddr, 2065 .sin6.sin6_family = AF_INET6, 2066 }; 2067 2068 vxlan_ip_miss(dev, &ipa); 2069 return false; 2070 } 2071 2072 break; 2073 } 2074#endif 2075 default: 2076 return false; 2077 } 2078 2079 if (n) { 2080 bool diff; 2081 2082 diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha); 2083 if (diff) { 2084 memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, 2085 dev->addr_len); 2086 memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len); 2087 } 2088 neigh_release(n); 2089 return diff; 2090 } 2091 2092 return false; 2093} 2094 2095static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags, 2096 struct vxlan_metadata *md) 2097{ 2098 struct vxlanhdr_gbp *gbp; 2099 2100 if (!md->gbp) 2101 return; 2102 2103 gbp = (struct vxlanhdr_gbp *)vxh; 2104 vxh->vx_flags |= VXLAN_HF_GBP; 2105 2106 if (md->gbp & VXLAN_GBP_DONT_LEARN) 2107 gbp->dont_learn = 1; 2108 2109 if (md->gbp & VXLAN_GBP_POLICY_APPLIED) 2110 gbp->policy_applied = 1; 2111 2112 gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK); 2113} 2114 2115static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags, 2116 __be16 protocol) 2117{ 2118 struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh; 2119 2120 gpe->np_applied = 1; 2121 gpe->next_protocol = tun_p_from_eth_p(protocol); 2122 if (!gpe->next_protocol) 2123 return -EPFNOSUPPORT; 2124 return 0; 2125} 2126 2127static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst, 2128 int iphdr_len, __be32 vni, 2129 struct vxlan_metadata *md, u32 vxflags, 2130 bool udp_sum) 2131{ 2132 struct vxlanhdr *vxh; 2133 int min_headroom; 2134 int err; 2135 int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; 2136 __be16 inner_protocol = htons(ETH_P_TEB); 2137 2138 if ((vxflags & VXLAN_F_REMCSUM_TX) && 2139 skb->ip_summed == CHECKSUM_PARTIAL) { 2140 int csum_start = skb_checksum_start_offset(skb); 2141 2142 if (csum_start <= VXLAN_MAX_REMCSUM_START && 2143 !(csum_start & VXLAN_RCO_SHIFT_MASK) && 2144 (skb->csum_offset == offsetof(struct udphdr, check) || 2145 skb->csum_offset == offsetof(struct tcphdr, check))) 2146 type |= SKB_GSO_TUNNEL_REMCSUM; 2147 } 2148 2149 min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len 2150 + VXLAN_HLEN + iphdr_len; 2151 2152 /* Need space for new headers (invalidates iph ptr) */ 2153 err = skb_cow_head(skb, min_headroom); 2154 if (unlikely(err)) 2155 return err; 2156 2157 err = iptunnel_handle_offloads(skb, type); 2158 if (err) 2159 return err; 2160 2161 vxh = __skb_push(skb, sizeof(*vxh)); 2162 vxh->vx_flags = VXLAN_HF_VNI; 2163 vxh->vx_vni = vxlan_vni_field(vni); 2164 2165 if (type & SKB_GSO_TUNNEL_REMCSUM) { 2166 unsigned int start; 2167 2168 start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr); 2169 vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset); 2170 vxh->vx_flags |= VXLAN_HF_RCO; 2171 2172 if (!skb_is_gso(skb)) { 2173 skb->ip_summed = CHECKSUM_NONE; 2174 skb->encapsulation = 0; 2175 } 2176 } 2177 2178 if (vxflags & VXLAN_F_GBP) 2179 vxlan_build_gbp_hdr(vxh, vxflags, md); 2180 if (vxflags & VXLAN_F_GPE) { 2181 err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol); 2182 if (err < 0) 2183 return err; 2184 inner_protocol = skb->protocol; 2185 } 2186 2187 skb_set_inner_protocol(skb, inner_protocol); 2188 return 0; 2189} 2190 2191static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev, 2192 struct vxlan_sock *sock4, 2193 struct sk_buff *skb, int oif, u8 tos, 2194 __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport, 2195 struct dst_cache *dst_cache, 2196 const struct ip_tunnel_info *info) 2197{ 2198 bool use_cache = ip_tunnel_dst_cache_usable(skb, info); 2199 struct rtable *rt = NULL; 2200 struct flowi4 fl4; 2201 2202 if (!sock4) 2203 return ERR_PTR(-EIO); 2204 2205 if (tos && !info) 2206 use_cache = false; 2207 if (use_cache) { 2208 rt = dst_cache_get_ip4(dst_cache, saddr); 2209 if (rt) 2210 return rt; 2211 } 2212 2213 memset(&fl4, 0, sizeof(fl4)); 2214 fl4.flowi4_oif = oif; 2215 fl4.flowi4_tos = RT_TOS(tos); 2216 fl4.flowi4_mark = skb->mark; 2217 fl4.flowi4_proto = IPPROTO_UDP; 2218 fl4.daddr = daddr; 2219 fl4.saddr = *saddr; 2220 fl4.fl4_dport = dport; 2221 fl4.fl4_sport = sport; 2222 2223 rt = ip_route_output_key(vxlan->net, &fl4); 2224 if (likely(!IS_ERR(rt))) { 2225 if (rt->dst.dev == dev) { 2226 netdev_dbg(dev, "circular route to %pI4\n", &daddr); 2227 ip_rt_put(rt); 2228 return ERR_PTR(-ELOOP); 2229 } 2230 2231 *saddr = fl4.saddr; 2232 if (use_cache) 2233 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); 2234 } else { 2235 netdev_dbg(dev, "no route to %pI4\n", &daddr); 2236 return ERR_PTR(-ENETUNREACH); 2237 } 2238 return rt; 2239} 2240 2241#if IS_ENABLED(CONFIG_IPV6) 2242static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, 2243 struct net_device *dev, 2244 struct vxlan_sock *sock6, 2245 struct sk_buff *skb, int oif, u8 tos, 2246 __be32 label, 2247 const struct in6_addr *daddr, 2248 struct in6_addr *saddr, 2249 __be16 dport, __be16 sport, 2250 struct dst_cache *dst_cache, 2251 const struct ip_tunnel_info *info) 2252{ 2253 bool use_cache = ip_tunnel_dst_cache_usable(skb, info); 2254 struct dst_entry *ndst; 2255 struct flowi6 fl6; 2256 int err; 2257 2258 if (!sock6) 2259 return ERR_PTR(-EIO); 2260 2261 if (tos && !info) 2262 use_cache = false; 2263 if (use_cache) { 2264 ndst = dst_cache_get_ip6(dst_cache, saddr); 2265 if (ndst) 2266 return ndst; 2267 } 2268 2269 memset(&fl6, 0, sizeof(fl6)); 2270 fl6.flowi6_oif = oif; 2271 fl6.daddr = *daddr; 2272 fl6.saddr = *saddr; 2273 fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label); 2274 fl6.flowi6_mark = skb->mark; 2275 fl6.flowi6_proto = IPPROTO_UDP; 2276 fl6.fl6_dport = dport; 2277 fl6.fl6_sport = sport; 2278 2279 err = ipv6_stub->ipv6_dst_lookup(vxlan->net, 2280 sock6->sock->sk, 2281 &ndst, &fl6); 2282 if (unlikely(err < 0)) { 2283 netdev_dbg(dev, "no route to %pI6\n", daddr); 2284 return ERR_PTR(-ENETUNREACH); 2285 } 2286 2287 if (unlikely(ndst->dev == dev)) { 2288 netdev_dbg(dev, "circular route to %pI6\n", daddr); 2289 dst_release(ndst); 2290 return ERR_PTR(-ELOOP); 2291 } 2292 2293 *saddr = fl6.saddr; 2294 if (use_cache) 2295 dst_cache_set_ip6(dst_cache, ndst, saddr); 2296 return ndst; 2297} 2298#endif 2299 2300/* Bypass encapsulation if the destination is local */ 2301static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan, 2302 struct vxlan_dev *dst_vxlan, __be32 vni) 2303{ 2304 struct pcpu_sw_netstats *tx_stats, *rx_stats; 2305 union vxlan_addr loopback; 2306 union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip; 2307 struct net_device *dev; 2308 int len = skb->len; 2309 2310 tx_stats = this_cpu_ptr(src_vxlan->dev->tstats); 2311 rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats); 2312 skb->pkt_type = PACKET_HOST; 2313 skb->encapsulation = 0; 2314 skb->dev = dst_vxlan->dev; 2315 __skb_pull(skb, skb_network_offset(skb)); 2316 2317 if (remote_ip->sa.sa_family == AF_INET) { 2318 loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2319 loopback.sa.sa_family = AF_INET; 2320#if IS_ENABLED(CONFIG_IPV6) 2321 } else { 2322 loopback.sin6.sin6_addr = in6addr_loopback; 2323 loopback.sa.sa_family = AF_INET6; 2324#endif 2325 } 2326 2327 rcu_read_lock(); 2328 dev = skb->dev; 2329 if (unlikely(!(dev->flags & IFF_UP))) { 2330 kfree_skb(skb); 2331 goto drop; 2332 } 2333 2334 if (dst_vxlan->cfg.flags & VXLAN_F_LEARN) 2335 vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni); 2336 2337 u64_stats_update_begin(&tx_stats->syncp); 2338 tx_stats->tx_packets++; 2339 tx_stats->tx_bytes += len; 2340 u64_stats_update_end(&tx_stats->syncp); 2341 2342 if (netif_rx(skb) == NET_RX_SUCCESS) { 2343 u64_stats_update_begin(&rx_stats->syncp); 2344 rx_stats->rx_packets++; 2345 rx_stats->rx_bytes += len; 2346 u64_stats_update_end(&rx_stats->syncp); 2347 } else { 2348drop: 2349 dev->stats.rx_dropped++; 2350 } 2351 rcu_read_unlock(); 2352} 2353 2354static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev, 2355 struct vxlan_dev *vxlan, 2356 union vxlan_addr *daddr, 2357 __be16 dst_port, int dst_ifindex, __be32 vni, 2358 struct dst_entry *dst, 2359 u32 rt_flags) 2360{ 2361#if IS_ENABLED(CONFIG_IPV6) 2362 /* IPv6 rt-flags are checked against RTF_LOCAL, but the value of 2363 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple 2364 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry. 2365 */ 2366 BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL); 2367#endif 2368 /* Bypass encapsulation if the destination is local */ 2369 if (rt_flags & RTCF_LOCAL && 2370 !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) { 2371 struct vxlan_dev *dst_vxlan; 2372 2373 dst_release(dst); 2374 dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni, 2375 daddr->sa.sa_family, dst_port, 2376 vxlan->cfg.flags); 2377 if (!dst_vxlan) { 2378 dev->stats.tx_errors++; 2379 kfree_skb(skb); 2380 2381 return -ENOENT; 2382 } 2383 vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni); 2384 return 1; 2385 } 2386 2387 return 0; 2388} 2389 2390static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, 2391 __be32 default_vni, struct vxlan_rdst *rdst, 2392 bool did_rsc) 2393{ 2394 struct dst_cache *dst_cache; 2395 struct ip_tunnel_info *info; 2396 struct vxlan_dev *vxlan = netdev_priv(dev); 2397 const struct iphdr *old_iph = ip_hdr(skb); 2398 union vxlan_addr *dst; 2399 union vxlan_addr remote_ip, local_ip; 2400 struct vxlan_metadata _md; 2401 struct vxlan_metadata *md = &_md; 2402 __be16 src_port = 0, dst_port; 2403 struct dst_entry *ndst = NULL; 2404 __be32 vni, label; 2405 __u8 tos, ttl; 2406 int ifindex; 2407 int err; 2408 u32 flags = vxlan->cfg.flags; 2409 bool udp_sum = false; 2410 bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev)); 2411 2412 info = skb_tunnel_info(skb); 2413 2414 if (rdst) { 2415 dst = &rdst->remote_ip; 2416 if (vxlan_addr_any(dst)) { 2417 if (did_rsc) { 2418 /* short-circuited back to local bridge */ 2419 vxlan_encap_bypass(skb, vxlan, vxlan, default_vni); 2420 return; 2421 } 2422 goto drop; 2423 } 2424 2425 dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port; 2426 vni = (rdst->remote_vni) ? : default_vni; 2427 ifindex = rdst->remote_ifindex; 2428 local_ip = vxlan->cfg.saddr; 2429 dst_cache = &rdst->dst_cache; 2430 md->gbp = skb->mark; 2431 if (flags & VXLAN_F_TTL_INHERIT) { 2432 ttl = ip_tunnel_get_ttl(old_iph, skb); 2433 } else { 2434 ttl = vxlan->cfg.ttl; 2435 if (!ttl && vxlan_addr_multicast(dst)) 2436 ttl = 1; 2437 } 2438 2439 tos = vxlan->cfg.tos; 2440 if (tos == 1) 2441 tos = ip_tunnel_get_dsfield(old_iph, skb); 2442 2443 if (dst->sa.sa_family == AF_INET) 2444 udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX); 2445 else 2446 udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX); 2447 label = vxlan->cfg.label; 2448 } else { 2449 if (!info) { 2450 WARN_ONCE(1, "%s: Missing encapsulation instructions\n", 2451 dev->name); 2452 goto drop; 2453 } 2454 remote_ip.sa.sa_family = ip_tunnel_info_af(info); 2455 if (remote_ip.sa.sa_family == AF_INET) { 2456 remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst; 2457 local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src; 2458 } else { 2459 remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst; 2460 local_ip.sin6.sin6_addr = info->key.u.ipv6.src; 2461 } 2462 dst = &remote_ip; 2463 dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port; 2464 vni = tunnel_id_to_key32(info->key.tun_id); 2465 ifindex = 0; 2466 dst_cache = &info->dst_cache; 2467 if (info->options_len && 2468 info->key.tun_flags & TUNNEL_VXLAN_OPT) 2469 md = ip_tunnel_info_opts(info); 2470 ttl = info->key.ttl; 2471 tos = info->key.tos; 2472 label = info->key.label; 2473 udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM); 2474 } 2475 src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, 2476 vxlan->cfg.port_max, true); 2477 2478 rcu_read_lock(); 2479 if (dst->sa.sa_family == AF_INET) { 2480 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); 2481 struct rtable *rt; 2482 __be16 df = 0; 2483 2484 if (!ifindex) 2485 ifindex = sock4->sock->sk->sk_bound_dev_if; 2486 2487 rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos, 2488 dst->sin.sin_addr.s_addr, 2489 &local_ip.sin.sin_addr.s_addr, 2490 dst_port, src_port, 2491 dst_cache, info); 2492 if (IS_ERR(rt)) { 2493 err = PTR_ERR(rt); 2494 goto tx_error; 2495 } 2496 2497 if (!info) { 2498 /* Bypass encapsulation if the destination is local */ 2499 err = encap_bypass_if_local(skb, dev, vxlan, dst, 2500 dst_port, ifindex, vni, 2501 &rt->dst, rt->rt_flags); 2502 if (err) 2503 goto out_unlock; 2504 2505 if (vxlan->cfg.df == VXLAN_DF_SET) { 2506 df = htons(IP_DF); 2507 } else if (vxlan->cfg.df == VXLAN_DF_INHERIT) { 2508 struct ethhdr *eth = eth_hdr(skb); 2509 2510 if (ntohs(eth->h_proto) == ETH_P_IPV6 || 2511 (ntohs(eth->h_proto) == ETH_P_IP && 2512 old_iph->frag_off & htons(IP_DF))) 2513 df = htons(IP_DF); 2514 } 2515 } else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) { 2516 df = htons(IP_DF); 2517 } 2518 2519 ndst = &rt->dst; 2520 skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM); 2521 2522 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 2523 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); 2524 err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr), 2525 vni, md, flags, udp_sum); 2526 if (err < 0) 2527 goto tx_error; 2528 2529 udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr, 2530 dst->sin.sin_addr.s_addr, tos, ttl, df, 2531 src_port, dst_port, xnet, !udp_sum); 2532#if IS_ENABLED(CONFIG_IPV6) 2533 } else { 2534 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); 2535 2536 if (!ifindex) 2537 ifindex = sock6->sock->sk->sk_bound_dev_if; 2538 2539 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos, 2540 label, &dst->sin6.sin6_addr, 2541 &local_ip.sin6.sin6_addr, 2542 dst_port, src_port, 2543 dst_cache, info); 2544 if (IS_ERR(ndst)) { 2545 err = PTR_ERR(ndst); 2546 ndst = NULL; 2547 goto tx_error; 2548 } 2549 2550 if (!info) { 2551 u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags; 2552 2553 err = encap_bypass_if_local(skb, dev, vxlan, dst, 2554 dst_port, ifindex, vni, 2555 ndst, rt6i_flags); 2556 if (err) 2557 goto out_unlock; 2558 } 2559 2560 skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM); 2561 2562 tos = ip_tunnel_ecn_encap(tos, old_iph, skb); 2563 ttl = ttl ? : ip6_dst_hoplimit(ndst); 2564 skb_scrub_packet(skb, xnet); 2565 err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr), 2566 vni, md, flags, udp_sum); 2567 if (err < 0) 2568 goto tx_error; 2569 2570 udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, 2571 &local_ip.sin6.sin6_addr, 2572 &dst->sin6.sin6_addr, tos, ttl, 2573 label, src_port, dst_port, !udp_sum); 2574#endif 2575 } 2576out_unlock: 2577 rcu_read_unlock(); 2578 return; 2579 2580drop: 2581 dev->stats.tx_dropped++; 2582 dev_kfree_skb(skb); 2583 return; 2584 2585tx_error: 2586 rcu_read_unlock(); 2587 if (err == -ELOOP) 2588 dev->stats.collisions++; 2589 else if (err == -ENETUNREACH) 2590 dev->stats.tx_carrier_errors++; 2591 dst_release(ndst); 2592 dev->stats.tx_errors++; 2593 kfree_skb(skb); 2594} 2595 2596/* Transmit local packets over Vxlan 2597 * 2598 * Outer IP header inherits ECN and DF from inner header. 2599 * Outer UDP destination is the VXLAN assigned port. 2600 * source port is based on hash of flow 2601 */ 2602static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) 2603{ 2604 struct vxlan_dev *vxlan = netdev_priv(dev); 2605 struct vxlan_rdst *rdst, *fdst = NULL; 2606 const struct ip_tunnel_info *info; 2607 bool did_rsc = false; 2608 struct vxlan_fdb *f; 2609 struct ethhdr *eth; 2610 __be32 vni = 0; 2611 2612 info = skb_tunnel_info(skb); 2613 2614 skb_reset_mac_header(skb); 2615 2616 if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) { 2617 if (info && info->mode & IP_TUNNEL_INFO_BRIDGE && 2618 info->mode & IP_TUNNEL_INFO_TX) { 2619 vni = tunnel_id_to_key32(info->key.tun_id); 2620 } else { 2621 if (info && info->mode & IP_TUNNEL_INFO_TX) 2622 vxlan_xmit_one(skb, dev, vni, NULL, false); 2623 else 2624 kfree_skb(skb); 2625 return NETDEV_TX_OK; 2626 } 2627 } 2628 2629 if (vxlan->cfg.flags & VXLAN_F_PROXY) { 2630 eth = eth_hdr(skb); 2631 if (ntohs(eth->h_proto) == ETH_P_ARP) 2632 return arp_reduce(dev, skb, vni); 2633#if IS_ENABLED(CONFIG_IPV6) 2634 else if (ntohs(eth->h_proto) == ETH_P_IPV6 && 2635 pskb_may_pull(skb, sizeof(struct ipv6hdr) + 2636 sizeof(struct nd_msg)) && 2637 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) { 2638 struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1); 2639 2640 if (m->icmph.icmp6_code == 0 && 2641 m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) 2642 return neigh_reduce(dev, skb, vni); 2643 } 2644#endif 2645 } 2646 2647 eth = eth_hdr(skb); 2648 f = vxlan_find_mac(vxlan, eth->h_dest, vni); 2649 did_rsc = false; 2650 2651 if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) && 2652 (ntohs(eth->h_proto) == ETH_P_IP || 2653 ntohs(eth->h_proto) == ETH_P_IPV6)) { 2654 did_rsc = route_shortcircuit(dev, skb); 2655 if (did_rsc) 2656 f = vxlan_find_mac(vxlan, eth->h_dest, vni); 2657 } 2658 2659 if (f == NULL) { 2660 f = vxlan_find_mac(vxlan, all_zeros_mac, vni); 2661 if (f == NULL) { 2662 if ((vxlan->cfg.flags & VXLAN_F_L2MISS) && 2663 !is_multicast_ether_addr(eth->h_dest)) 2664 vxlan_fdb_miss(vxlan, eth->h_dest); 2665 2666 dev->stats.tx_dropped++; 2667 kfree_skb(skb); 2668 return NETDEV_TX_OK; 2669 } 2670 } 2671 2672 list_for_each_entry_rcu(rdst, &f->remotes, list) { 2673 struct sk_buff *skb1; 2674 2675 if (!fdst) { 2676 fdst = rdst; 2677 continue; 2678 } 2679 skb1 = skb_clone(skb, GFP_ATOMIC); 2680 if (skb1) 2681 vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); 2682 } 2683 2684 if (fdst) 2685 vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); 2686 else 2687 kfree_skb(skb); 2688 return NETDEV_TX_OK; 2689} 2690 2691/* Walk the forwarding table and purge stale entries */ 2692static void vxlan_cleanup(struct timer_list *t) 2693{ 2694 struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer); 2695 unsigned long next_timer = jiffies + FDB_AGE_INTERVAL; 2696 unsigned int h; 2697 2698 if (!netif_running(vxlan->dev)) 2699 return; 2700 2701 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2702 struct hlist_node *p, *n; 2703 2704 spin_lock(&vxlan->hash_lock); 2705 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2706 struct vxlan_fdb *f 2707 = container_of(p, struct vxlan_fdb, hlist); 2708 unsigned long timeout; 2709 2710 if (f->state & (NUD_PERMANENT | NUD_NOARP)) 2711 continue; 2712 2713 if (f->flags & NTF_EXT_LEARNED) 2714 continue; 2715 2716 timeout = f->used + vxlan->cfg.age_interval * HZ; 2717 if (time_before_eq(timeout, jiffies)) { 2718 netdev_dbg(vxlan->dev, 2719 "garbage collect %pM\n", 2720 f->eth_addr); 2721 f->state = NUD_STALE; 2722 vxlan_fdb_destroy(vxlan, f, true, true); 2723 } else if (time_before(timeout, next_timer)) 2724 next_timer = timeout; 2725 } 2726 spin_unlock(&vxlan->hash_lock); 2727 } 2728 2729 mod_timer(&vxlan->age_timer, next_timer); 2730} 2731 2732static void vxlan_vs_del_dev(struct vxlan_dev *vxlan) 2733{ 2734 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2735 2736 spin_lock(&vn->sock_lock); 2737 hlist_del_init_rcu(&vxlan->hlist4.hlist); 2738#if IS_ENABLED(CONFIG_IPV6) 2739 hlist_del_init_rcu(&vxlan->hlist6.hlist); 2740#endif 2741 spin_unlock(&vn->sock_lock); 2742} 2743 2744static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan, 2745 struct vxlan_dev_node *node) 2746{ 2747 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2748 __be32 vni = vxlan->default_dst.remote_vni; 2749 2750 node->vxlan = vxlan; 2751 spin_lock(&vn->sock_lock); 2752 hlist_add_head_rcu(&node->hlist, vni_head(vs, vni)); 2753 spin_unlock(&vn->sock_lock); 2754} 2755 2756/* Setup stats when device is created */ 2757static int vxlan_init(struct net_device *dev) 2758{ 2759 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); 2760 if (!dev->tstats) 2761 return -ENOMEM; 2762 2763 return 0; 2764} 2765 2766static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni) 2767{ 2768 struct vxlan_fdb *f; 2769 2770 spin_lock_bh(&vxlan->hash_lock); 2771 f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); 2772 if (f) 2773 vxlan_fdb_destroy(vxlan, f, true, true); 2774 spin_unlock_bh(&vxlan->hash_lock); 2775} 2776 2777static void vxlan_uninit(struct net_device *dev) 2778{ 2779 struct vxlan_dev *vxlan = netdev_priv(dev); 2780 2781 gro_cells_destroy(&vxlan->gro_cells); 2782 2783 vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni); 2784 2785 free_percpu(dev->tstats); 2786} 2787 2788/* Start ageing timer and join group when device is brought up */ 2789static int vxlan_open(struct net_device *dev) 2790{ 2791 struct vxlan_dev *vxlan = netdev_priv(dev); 2792 int ret; 2793 2794 ret = vxlan_sock_add(vxlan); 2795 if (ret < 0) 2796 return ret; 2797 2798 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) { 2799 ret = vxlan_igmp_join(vxlan); 2800 if (ret == -EADDRINUSE) 2801 ret = 0; 2802 if (ret) { 2803 vxlan_sock_release(vxlan); 2804 return ret; 2805 } 2806 } 2807 2808 if (vxlan->cfg.age_interval) 2809 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL); 2810 2811 return ret; 2812} 2813 2814/* Purge the forwarding table */ 2815static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all) 2816{ 2817 unsigned int h; 2818 2819 spin_lock_bh(&vxlan->hash_lock); 2820 for (h = 0; h < FDB_HASH_SIZE; ++h) { 2821 struct hlist_node *p, *n; 2822 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) { 2823 struct vxlan_fdb *f 2824 = container_of(p, struct vxlan_fdb, hlist); 2825 if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP))) 2826 continue; 2827 /* the all_zeros_mac entry is deleted at vxlan_uninit */ 2828 if (!is_zero_ether_addr(f->eth_addr)) 2829 vxlan_fdb_destroy(vxlan, f, true, true); 2830 } 2831 } 2832 spin_unlock_bh(&vxlan->hash_lock); 2833} 2834 2835/* Cleanup timer and forwarding table on shutdown */ 2836static int vxlan_stop(struct net_device *dev) 2837{ 2838 struct vxlan_dev *vxlan = netdev_priv(dev); 2839 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 2840 int ret = 0; 2841 2842 if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) && 2843 !vxlan_group_used(vn, vxlan)) 2844 ret = vxlan_igmp_leave(vxlan); 2845 2846 del_timer_sync(&vxlan->age_timer); 2847 2848 vxlan_flush(vxlan, false); 2849 vxlan_sock_release(vxlan); 2850 2851 return ret; 2852} 2853 2854/* Stub, nothing needs to be done. */ 2855static void vxlan_set_multicast_list(struct net_device *dev) 2856{ 2857} 2858 2859static int vxlan_change_mtu(struct net_device *dev, int new_mtu) 2860{ 2861 struct vxlan_dev *vxlan = netdev_priv(dev); 2862 struct vxlan_rdst *dst = &vxlan->default_dst; 2863 struct net_device *lowerdev = __dev_get_by_index(vxlan->net, 2864 dst->remote_ifindex); 2865 bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6); 2866 2867 /* This check is different than dev->max_mtu, because it looks at 2868 * the lowerdev->mtu, rather than the static dev->max_mtu 2869 */ 2870 if (lowerdev) { 2871 int max_mtu = lowerdev->mtu - 2872 (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM); 2873 if (new_mtu > max_mtu) 2874 return -EINVAL; 2875 } 2876 2877 dev->mtu = new_mtu; 2878 return 0; 2879} 2880 2881static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) 2882{ 2883 struct vxlan_dev *vxlan = netdev_priv(dev); 2884 struct ip_tunnel_info *info = skb_tunnel_info(skb); 2885 __be16 sport, dport; 2886 2887 sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min, 2888 vxlan->cfg.port_max, true); 2889 dport = info->key.tp_dst ? : vxlan->cfg.dst_port; 2890 2891 if (ip_tunnel_info_af(info) == AF_INET) { 2892 struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); 2893 struct rtable *rt; 2894 2895 rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos, 2896 info->key.u.ipv4.dst, 2897 &info->key.u.ipv4.src, dport, sport, 2898 &info->dst_cache, info); 2899 if (IS_ERR(rt)) 2900 return PTR_ERR(rt); 2901 ip_rt_put(rt); 2902 } else { 2903#if IS_ENABLED(CONFIG_IPV6) 2904 struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); 2905 struct dst_entry *ndst; 2906 2907 ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos, 2908 info->key.label, &info->key.u.ipv6.dst, 2909 &info->key.u.ipv6.src, dport, sport, 2910 &info->dst_cache, info); 2911 if (IS_ERR(ndst)) 2912 return PTR_ERR(ndst); 2913 dst_release(ndst); 2914#else /* !CONFIG_IPV6 */ 2915 return -EPFNOSUPPORT; 2916#endif 2917 } 2918 info->key.tp_src = sport; 2919 info->key.tp_dst = dport; 2920 return 0; 2921} 2922 2923static const struct net_device_ops vxlan_netdev_ether_ops = { 2924 .ndo_init = vxlan_init, 2925 .ndo_uninit = vxlan_uninit, 2926 .ndo_open = vxlan_open, 2927 .ndo_stop = vxlan_stop, 2928 .ndo_start_xmit = vxlan_xmit, 2929 .ndo_get_stats64 = ip_tunnel_get_stats64, 2930 .ndo_set_rx_mode = vxlan_set_multicast_list, 2931 .ndo_change_mtu = vxlan_change_mtu, 2932 .ndo_validate_addr = eth_validate_addr, 2933 .ndo_set_mac_address = eth_mac_addr, 2934 .ndo_fdb_add = vxlan_fdb_add, 2935 .ndo_fdb_del = vxlan_fdb_delete, 2936 .ndo_fdb_dump = vxlan_fdb_dump, 2937 .ndo_fdb_get = vxlan_fdb_get, 2938 .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, 2939 .ndo_change_proto_down = dev_change_proto_down_generic, 2940}; 2941 2942static const struct net_device_ops vxlan_netdev_raw_ops = { 2943 .ndo_init = vxlan_init, 2944 .ndo_uninit = vxlan_uninit, 2945 .ndo_open = vxlan_open, 2946 .ndo_stop = vxlan_stop, 2947 .ndo_start_xmit = vxlan_xmit, 2948 .ndo_get_stats64 = ip_tunnel_get_stats64, 2949 .ndo_change_mtu = vxlan_change_mtu, 2950 .ndo_fill_metadata_dst = vxlan_fill_metadata_dst, 2951}; 2952 2953/* Info for udev, that this is a virtual tunnel endpoint */ 2954static struct device_type vxlan_type = { 2955 .name = "vxlan", 2956}; 2957 2958/* Calls the ndo_udp_tunnel_add of the caller in order to 2959 * supply the listening VXLAN udp ports. Callers are expected 2960 * to implement the ndo_udp_tunnel_add. 2961 */ 2962static void vxlan_offload_rx_ports(struct net_device *dev, bool push) 2963{ 2964 struct vxlan_sock *vs; 2965 struct net *net = dev_net(dev); 2966 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 2967 unsigned int i; 2968 2969 spin_lock(&vn->sock_lock); 2970 for (i = 0; i < PORT_HASH_SIZE; ++i) { 2971 hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { 2972 unsigned short type; 2973 2974 if (vs->flags & VXLAN_F_GPE) 2975 type = UDP_TUNNEL_TYPE_VXLAN_GPE; 2976 else 2977 type = UDP_TUNNEL_TYPE_VXLAN; 2978 2979 if (push) 2980 udp_tunnel_push_rx_port(dev, vs->sock, type); 2981 else 2982 udp_tunnel_drop_rx_port(dev, vs->sock, type); 2983 } 2984 } 2985 spin_unlock(&vn->sock_lock); 2986} 2987 2988/* Initialize the device structure. */ 2989static void vxlan_setup(struct net_device *dev) 2990{ 2991 struct vxlan_dev *vxlan = netdev_priv(dev); 2992 unsigned int h; 2993 2994 eth_hw_addr_random(dev); 2995 ether_setup(dev); 2996 2997 dev->needs_free_netdev = true; 2998 SET_NETDEV_DEVTYPE(dev, &vxlan_type); 2999 3000 dev->features |= NETIF_F_LLTX; 3001 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM; 3002 dev->features |= NETIF_F_RXCSUM; 3003 dev->features |= NETIF_F_GSO_SOFTWARE; 3004 3005 dev->vlan_features = dev->features; 3006 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM; 3007 dev->hw_features |= NETIF_F_GSO_SOFTWARE; 3008 netif_keep_dst(dev); 3009 dev->priv_flags |= IFF_NO_QUEUE; 3010 3011 /* MTU range: 68 - 65535 */ 3012 dev->min_mtu = ETH_MIN_MTU; 3013 dev->max_mtu = ETH_MAX_MTU; 3014 3015 INIT_LIST_HEAD(&vxlan->next); 3016 spin_lock_init(&vxlan->hash_lock); 3017 3018 timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE); 3019 3020 vxlan->dev = dev; 3021 3022 gro_cells_init(&vxlan->gro_cells, dev); 3023 3024 for (h = 0; h < FDB_HASH_SIZE; ++h) 3025 INIT_HLIST_HEAD(&vxlan->fdb_head[h]); 3026} 3027 3028static void vxlan_ether_setup(struct net_device *dev) 3029{ 3030 dev->priv_flags &= ~IFF_TX_SKB_SHARING; 3031 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; 3032 dev->netdev_ops = &vxlan_netdev_ether_ops; 3033} 3034 3035static void vxlan_raw_setup(struct net_device *dev) 3036{ 3037 dev->header_ops = NULL; 3038 dev->type = ARPHRD_NONE; 3039 dev->hard_header_len = 0; 3040 dev->addr_len = 0; 3041 dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; 3042 dev->netdev_ops = &vxlan_netdev_raw_ops; 3043} 3044 3045static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { 3046 [IFLA_VXLAN_ID] = { .type = NLA_U32 }, 3047 [IFLA_VXLAN_GROUP] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, 3048 [IFLA_VXLAN_GROUP6] = { .len = sizeof(struct in6_addr) }, 3049 [IFLA_VXLAN_LINK] = { .type = NLA_U32 }, 3050 [IFLA_VXLAN_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, 3051 [IFLA_VXLAN_LOCAL6] = { .len = sizeof(struct in6_addr) }, 3052 [IFLA_VXLAN_TOS] = { .type = NLA_U8 }, 3053 [IFLA_VXLAN_TTL] = { .type = NLA_U8 }, 3054 [IFLA_VXLAN_LABEL] = { .type = NLA_U32 }, 3055 [IFLA_VXLAN_LEARNING] = { .type = NLA_U8 }, 3056 [IFLA_VXLAN_AGEING] = { .type = NLA_U32 }, 3057 [IFLA_VXLAN_LIMIT] = { .type = NLA_U32 }, 3058 [IFLA_VXLAN_PORT_RANGE] = { .len = sizeof(struct ifla_vxlan_port_range) }, 3059 [IFLA_VXLAN_PROXY] = { .type = NLA_U8 }, 3060 [IFLA_VXLAN_RSC] = { .type = NLA_U8 }, 3061 [IFLA_VXLAN_L2MISS] = { .type = NLA_U8 }, 3062 [IFLA_VXLAN_L3MISS] = { .type = NLA_U8 }, 3063 [IFLA_VXLAN_COLLECT_METADATA] = { .type = NLA_U8 }, 3064 [IFLA_VXLAN_PORT] = { .type = NLA_U16 }, 3065 [IFLA_VXLAN_UDP_CSUM] = { .type = NLA_U8 }, 3066 [IFLA_VXLAN_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, 3067 [IFLA_VXLAN_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, 3068 [IFLA_VXLAN_REMCSUM_TX] = { .type = NLA_U8 }, 3069 [IFLA_VXLAN_REMCSUM_RX] = { .type = NLA_U8 }, 3070 [IFLA_VXLAN_GBP] = { .type = NLA_FLAG, }, 3071 [IFLA_VXLAN_GPE] = { .type = NLA_FLAG, }, 3072 [IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG }, 3073 [IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG }, 3074 [IFLA_VXLAN_DF] = { .type = NLA_U8 }, 3075}; 3076 3077static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], 3078 struct netlink_ext_ack *extack) 3079{ 3080 if (tb[IFLA_ADDRESS]) { 3081 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { 3082 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], 3083 "Provided link layer address is not Ethernet"); 3084 return -EINVAL; 3085 } 3086 3087 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { 3088 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS], 3089 "Provided Ethernet address is not unicast"); 3090 return -EADDRNOTAVAIL; 3091 } 3092 } 3093 3094 if (tb[IFLA_MTU]) { 3095 u32 mtu = nla_get_u32(tb[IFLA_MTU]); 3096 3097 if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) { 3098 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], 3099 "MTU must be between 68 and 65535"); 3100 return -EINVAL; 3101 } 3102 } 3103 3104 if (!data) { 3105 NL_SET_ERR_MSG(extack, 3106 "Required attributes not provided to perform the operation"); 3107 return -EINVAL; 3108 } 3109 3110 if (data[IFLA_VXLAN_ID]) { 3111 u32 id = nla_get_u32(data[IFLA_VXLAN_ID]); 3112 3113 if (id >= VXLAN_N_VID) { 3114 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], 3115 "VXLAN ID must be lower than 16777216"); 3116 return -ERANGE; 3117 } 3118 } 3119 3120 if (data[IFLA_VXLAN_PORT_RANGE]) { 3121 const struct ifla_vxlan_port_range *p 3122 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 3123 3124 if (ntohs(p->high) < ntohs(p->low)) { 3125 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], 3126 "Invalid source port range"); 3127 return -EINVAL; 3128 } 3129 } 3130 3131 if (data[IFLA_VXLAN_DF]) { 3132 enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]); 3133 3134 if (df < 0 || df > VXLAN_DF_MAX) { 3135 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF], 3136 "Invalid DF attribute"); 3137 return -EINVAL; 3138 } 3139 } 3140 3141 return 0; 3142} 3143 3144static void vxlan_get_drvinfo(struct net_device *netdev, 3145 struct ethtool_drvinfo *drvinfo) 3146{ 3147 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version)); 3148 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver)); 3149} 3150 3151static const struct ethtool_ops vxlan_ethtool_ops = { 3152 .get_drvinfo = vxlan_get_drvinfo, 3153 .get_link = ethtool_op_get_link, 3154}; 3155 3156static struct socket *vxlan_create_sock(struct net *net, bool ipv6, 3157 __be16 port, u32 flags, int ifindex) 3158{ 3159 struct socket *sock; 3160 struct udp_port_cfg udp_conf; 3161 int err; 3162 3163 memset(&udp_conf, 0, sizeof(udp_conf)); 3164 3165 if (ipv6) { 3166 udp_conf.family = AF_INET6; 3167 udp_conf.use_udp6_rx_checksums = 3168 !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX); 3169 udp_conf.ipv6_v6only = 1; 3170 } else { 3171 udp_conf.family = AF_INET; 3172 } 3173 3174 udp_conf.local_udp_port = port; 3175 udp_conf.bind_ifindex = ifindex; 3176 3177 /* Open UDP socket */ 3178 err = udp_sock_create(net, &udp_conf, &sock); 3179 if (err < 0) 3180 return ERR_PTR(err); 3181 3182 return sock; 3183} 3184 3185/* Create new listen socket if needed */ 3186static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6, 3187 __be16 port, u32 flags, 3188 int ifindex) 3189{ 3190 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 3191 struct vxlan_sock *vs; 3192 struct socket *sock; 3193 unsigned int h; 3194 struct udp_tunnel_sock_cfg tunnel_cfg; 3195 3196 vs = kzalloc(sizeof(*vs), GFP_KERNEL); 3197 if (!vs) 3198 return ERR_PTR(-ENOMEM); 3199 3200 for (h = 0; h < VNI_HASH_SIZE; ++h) 3201 INIT_HLIST_HEAD(&vs->vni_list[h]); 3202 3203 sock = vxlan_create_sock(net, ipv6, port, flags, ifindex); 3204 if (IS_ERR(sock)) { 3205 kfree(vs); 3206 return ERR_CAST(sock); 3207 } 3208 3209 vs->sock = sock; 3210 refcount_set(&vs->refcnt, 1); 3211 vs->flags = (flags & VXLAN_F_RCV_FLAGS); 3212 3213 spin_lock(&vn->sock_lock); 3214 hlist_add_head_rcu(&vs->hlist, vs_head(net, port)); 3215 udp_tunnel_notify_add_rx_port(sock, 3216 (vs->flags & VXLAN_F_GPE) ? 3217 UDP_TUNNEL_TYPE_VXLAN_GPE : 3218 UDP_TUNNEL_TYPE_VXLAN); 3219 spin_unlock(&vn->sock_lock); 3220 3221 /* Mark socket as an encapsulation socket. */ 3222 memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); 3223 tunnel_cfg.sk_user_data = vs; 3224 tunnel_cfg.encap_type = 1; 3225 tunnel_cfg.encap_rcv = vxlan_rcv; 3226 tunnel_cfg.encap_err_lookup = vxlan_err_lookup; 3227 tunnel_cfg.encap_destroy = NULL; 3228 tunnel_cfg.gro_receive = vxlan_gro_receive; 3229 tunnel_cfg.gro_complete = vxlan_gro_complete; 3230 3231 setup_udp_tunnel_sock(net, sock, &tunnel_cfg); 3232 3233 return vs; 3234} 3235 3236static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6) 3237{ 3238 struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id); 3239 struct vxlan_sock *vs = NULL; 3240 struct vxlan_dev_node *node; 3241 int l3mdev_index = 0; 3242 3243 if (vxlan->cfg.remote_ifindex) 3244 l3mdev_index = l3mdev_master_upper_ifindex_by_index( 3245 vxlan->net, vxlan->cfg.remote_ifindex); 3246 3247 if (!vxlan->cfg.no_share) { 3248 spin_lock(&vn->sock_lock); 3249 vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET, 3250 vxlan->cfg.dst_port, vxlan->cfg.flags, 3251 l3mdev_index); 3252 if (vs && !refcount_inc_not_zero(&vs->refcnt)) { 3253 spin_unlock(&vn->sock_lock); 3254 return -EBUSY; 3255 } 3256 spin_unlock(&vn->sock_lock); 3257 } 3258 if (!vs) 3259 vs = vxlan_socket_create(vxlan->net, ipv6, 3260 vxlan->cfg.dst_port, vxlan->cfg.flags, 3261 l3mdev_index); 3262 if (IS_ERR(vs)) 3263 return PTR_ERR(vs); 3264#if IS_ENABLED(CONFIG_IPV6) 3265 if (ipv6) { 3266 rcu_assign_pointer(vxlan->vn6_sock, vs); 3267 node = &vxlan->hlist6; 3268 } else 3269#endif 3270 { 3271 rcu_assign_pointer(vxlan->vn4_sock, vs); 3272 node = &vxlan->hlist4; 3273 } 3274 vxlan_vs_add_dev(vs, vxlan, node); 3275 return 0; 3276} 3277 3278static int vxlan_sock_add(struct vxlan_dev *vxlan) 3279{ 3280 bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA; 3281 bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata; 3282 bool ipv4 = !ipv6 || metadata; 3283 int ret = 0; 3284 3285 RCU_INIT_POINTER(vxlan->vn4_sock, NULL); 3286#if IS_ENABLED(CONFIG_IPV6) 3287 RCU_INIT_POINTER(vxlan->vn6_sock, NULL); 3288 if (ipv6) { 3289 ret = __vxlan_sock_add(vxlan, true); 3290 if (ret < 0 && ret != -EAFNOSUPPORT) 3291 ipv4 = false; 3292 } 3293#endif 3294 if (ipv4) 3295 ret = __vxlan_sock_add(vxlan, false); 3296 if (ret < 0) 3297 vxlan_sock_release(vxlan); 3298 return ret; 3299} 3300 3301static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf, 3302 struct net_device **lower, 3303 struct vxlan_dev *old, 3304 struct netlink_ext_ack *extack) 3305{ 3306 struct vxlan_net *vn = net_generic(src_net, vxlan_net_id); 3307 struct vxlan_dev *tmp; 3308 bool use_ipv6 = false; 3309 3310 if (conf->flags & VXLAN_F_GPE) { 3311 /* For now, allow GPE only together with 3312 * COLLECT_METADATA. This can be relaxed later; in such 3313 * case, the other side of the PtP link will have to be 3314 * provided. 3315 */ 3316 if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) || 3317 !(conf->flags & VXLAN_F_COLLECT_METADATA)) { 3318 NL_SET_ERR_MSG(extack, 3319 "VXLAN GPE does not support this combination of attributes"); 3320 return -EINVAL; 3321 } 3322 } 3323 3324 if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) { 3325 /* Unless IPv6 is explicitly requested, assume IPv4 */ 3326 conf->remote_ip.sa.sa_family = AF_INET; 3327 conf->saddr.sa.sa_family = AF_INET; 3328 } else if (!conf->remote_ip.sa.sa_family) { 3329 conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family; 3330 } else if (!conf->saddr.sa.sa_family) { 3331 conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family; 3332 } 3333 3334 if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) { 3335 NL_SET_ERR_MSG(extack, 3336 "Local and remote address must be from the same family"); 3337 return -EINVAL; 3338 } 3339 3340 if (vxlan_addr_multicast(&conf->saddr)) { 3341 NL_SET_ERR_MSG(extack, "Local address cannot be multicast"); 3342 return -EINVAL; 3343 } 3344 3345 if (conf->saddr.sa.sa_family == AF_INET6) { 3346 if (!IS_ENABLED(CONFIG_IPV6)) { 3347 NL_SET_ERR_MSG(extack, 3348 "IPv6 support not enabled in the kernel"); 3349 return -EPFNOSUPPORT; 3350 } 3351 use_ipv6 = true; 3352 conf->flags |= VXLAN_F_IPV6; 3353 3354 if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) { 3355 int local_type = 3356 ipv6_addr_type(&conf->saddr.sin6.sin6_addr); 3357 int remote_type = 3358 ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr); 3359 3360 if (local_type & IPV6_ADDR_LINKLOCAL) { 3361 if (!(remote_type & IPV6_ADDR_LINKLOCAL) && 3362 (remote_type != IPV6_ADDR_ANY)) { 3363 NL_SET_ERR_MSG(extack, 3364 "Invalid combination of local and remote address scopes"); 3365 return -EINVAL; 3366 } 3367 3368 conf->flags |= VXLAN_F_IPV6_LINKLOCAL; 3369 } else { 3370 if (remote_type == 3371 (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) { 3372 NL_SET_ERR_MSG(extack, 3373 "Invalid combination of local and remote address scopes"); 3374 return -EINVAL; 3375 } 3376 3377 conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL; 3378 } 3379 } 3380 } 3381 3382 if (conf->label && !use_ipv6) { 3383 NL_SET_ERR_MSG(extack, 3384 "Label attribute only applies to IPv6 VXLAN devices"); 3385 return -EINVAL; 3386 } 3387 3388 if (conf->remote_ifindex) { 3389 struct net_device *lowerdev; 3390 3391 lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex); 3392 if (!lowerdev) { 3393 NL_SET_ERR_MSG(extack, 3394 "Invalid local interface, device not found"); 3395 return -ENODEV; 3396 } 3397 3398#if IS_ENABLED(CONFIG_IPV6) 3399 if (use_ipv6) { 3400 struct inet6_dev *idev = __in6_dev_get(lowerdev); 3401 if (idev && idev->cnf.disable_ipv6) { 3402 NL_SET_ERR_MSG(extack, 3403 "IPv6 support disabled by administrator"); 3404 return -EPERM; 3405 } 3406 } 3407#endif 3408 3409 *lower = lowerdev; 3410 } else { 3411 if (vxlan_addr_multicast(&conf->remote_ip)) { 3412 NL_SET_ERR_MSG(extack, 3413 "Local interface required for multicast remote destination"); 3414 3415 return -EINVAL; 3416 } 3417 3418#if IS_ENABLED(CONFIG_IPV6) 3419 if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) { 3420 NL_SET_ERR_MSG(extack, 3421 "Local interface required for link-local local/remote addresses"); 3422 return -EINVAL; 3423 } 3424#endif 3425 3426 *lower = NULL; 3427 } 3428 3429 if (!conf->dst_port) { 3430 if (conf->flags & VXLAN_F_GPE) 3431 conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */ 3432 else 3433 conf->dst_port = htons(vxlan_port); 3434 } 3435 3436 if (!conf->age_interval) 3437 conf->age_interval = FDB_AGE_DEFAULT; 3438 3439 list_for_each_entry(tmp, &vn->vxlan_list, next) { 3440 if (tmp == old) 3441 continue; 3442 3443 if (tmp->cfg.vni != conf->vni) 3444 continue; 3445 if (tmp->cfg.dst_port != conf->dst_port) 3446 continue; 3447 if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) != 3448 (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6))) 3449 continue; 3450 3451 if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) && 3452 tmp->cfg.remote_ifindex != conf->remote_ifindex) 3453 continue; 3454 3455 NL_SET_ERR_MSG(extack, 3456 "A VXLAN device with the specified VNI already exists"); 3457 return -EEXIST; 3458 } 3459 3460 return 0; 3461} 3462 3463static void vxlan_config_apply(struct net_device *dev, 3464 struct vxlan_config *conf, 3465 struct net_device *lowerdev, 3466 struct net *src_net, 3467 bool changelink) 3468{ 3469 struct vxlan_dev *vxlan = netdev_priv(dev); 3470 struct vxlan_rdst *dst = &vxlan->default_dst; 3471 unsigned short needed_headroom = ETH_HLEN; 3472 bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6); 3473 int max_mtu = ETH_MAX_MTU; 3474 3475 if (!changelink) { 3476 if (conf->flags & VXLAN_F_GPE) 3477 vxlan_raw_setup(dev); 3478 else 3479 vxlan_ether_setup(dev); 3480 3481 if (conf->mtu) 3482 dev->mtu = conf->mtu; 3483 3484 vxlan->net = src_net; 3485 } 3486 3487 dst->remote_vni = conf->vni; 3488 3489 memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip)); 3490 3491 if (lowerdev) { 3492 dst->remote_ifindex = conf->remote_ifindex; 3493 3494 dev->gso_max_size = lowerdev->gso_max_size; 3495 dev->gso_max_segs = lowerdev->gso_max_segs; 3496 3497 needed_headroom = lowerdev->hard_header_len; 3498 3499 max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : 3500 VXLAN_HEADROOM); 3501 if (max_mtu < ETH_MIN_MTU) 3502 max_mtu = ETH_MIN_MTU; 3503 3504 if (!changelink && !conf->mtu) 3505 dev->mtu = max_mtu; 3506 } 3507 3508 if (dev->mtu > max_mtu) 3509 dev->mtu = max_mtu; 3510 3511 if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) 3512 needed_headroom += VXLAN6_HEADROOM; 3513 else 3514 needed_headroom += VXLAN_HEADROOM; 3515 dev->needed_headroom = needed_headroom; 3516 3517 memcpy(&vxlan->cfg, conf, sizeof(*conf)); 3518} 3519 3520static int vxlan_dev_configure(struct net *src_net, struct net_device *dev, 3521 struct vxlan_config *conf, bool changelink, 3522 struct netlink_ext_ack *extack) 3523{ 3524 struct vxlan_dev *vxlan = netdev_priv(dev); 3525 struct net_device *lowerdev; 3526 int ret; 3527 3528 ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack); 3529 if (ret) 3530 return ret; 3531 3532 vxlan_config_apply(dev, conf, lowerdev, src_net, changelink); 3533 3534 return 0; 3535} 3536 3537static int __vxlan_dev_create(struct net *net, struct net_device *dev, 3538 struct vxlan_config *conf, 3539 struct netlink_ext_ack *extack) 3540{ 3541 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 3542 struct vxlan_dev *vxlan = netdev_priv(dev); 3543 struct vxlan_fdb *f = NULL; 3544 bool unregister = false; 3545 int err; 3546 3547 err = vxlan_dev_configure(net, dev, conf, false, extack); 3548 if (err) 3549 return err; 3550 3551 dev->ethtool_ops = &vxlan_ethtool_ops; 3552 3553 /* create an fdb entry for a valid default destination */ 3554 if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { 3555 err = vxlan_fdb_create(vxlan, all_zeros_mac, 3556 &vxlan->default_dst.remote_ip, 3557 NUD_REACHABLE | NUD_PERMANENT, 3558 vxlan->cfg.dst_port, 3559 vxlan->default_dst.remote_vni, 3560 vxlan->default_dst.remote_vni, 3561 vxlan->default_dst.remote_ifindex, 3562 NTF_SELF, &f); 3563 if (err) 3564 return err; 3565 } 3566 3567 err = register_netdevice(dev); 3568 if (err) 3569 goto errout; 3570 unregister = true; 3571 3572 err = rtnl_configure_link(dev, NULL); 3573 if (err) 3574 goto errout; 3575 3576 /* notify default fdb entry */ 3577 if (f) { 3578 err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), 3579 RTM_NEWNEIGH, true, extack); 3580 if (err) 3581 goto errout; 3582 } 3583 3584 list_add(&vxlan->next, &vn->vxlan_list); 3585 return 0; 3586 3587errout: 3588 /* unregister_netdevice() destroys the default FDB entry with deletion 3589 * notification. But the addition notification was not sent yet, so 3590 * destroy the entry by hand here. 3591 */ 3592 if (f) 3593 vxlan_fdb_destroy(vxlan, f, false, false); 3594 if (unregister) 3595 unregister_netdevice(dev); 3596 return err; 3597} 3598 3599/* Set/clear flags based on attribute */ 3600static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[], 3601 int attrtype, unsigned long mask, bool changelink, 3602 bool changelink_supported, 3603 struct netlink_ext_ack *extack) 3604{ 3605 unsigned long flags; 3606 3607 if (!tb[attrtype]) 3608 return 0; 3609 3610 if (changelink && !changelink_supported) { 3611 vxlan_flag_attr_error(attrtype, extack); 3612 return -EOPNOTSUPP; 3613 } 3614 3615 if (vxlan_policy[attrtype].type == NLA_FLAG) 3616 flags = conf->flags | mask; 3617 else if (nla_get_u8(tb[attrtype])) 3618 flags = conf->flags | mask; 3619 else 3620 flags = conf->flags & ~mask; 3621 3622 conf->flags = flags; 3623 3624 return 0; 3625} 3626 3627static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], 3628 struct net_device *dev, struct vxlan_config *conf, 3629 bool changelink, struct netlink_ext_ack *extack) 3630{ 3631 struct vxlan_dev *vxlan = netdev_priv(dev); 3632 int err = 0; 3633 3634 memset(conf, 0, sizeof(*conf)); 3635 3636 /* if changelink operation, start with old existing cfg */ 3637 if (changelink) 3638 memcpy(conf, &vxlan->cfg, sizeof(*conf)); 3639 3640 if (data[IFLA_VXLAN_ID]) { 3641 __be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); 3642 3643 if (changelink && (vni != conf->vni)) { 3644 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI"); 3645 return -EOPNOTSUPP; 3646 } 3647 conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID])); 3648 } 3649 3650 if (data[IFLA_VXLAN_GROUP]) { 3651 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) { 3652 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group"); 3653 return -EOPNOTSUPP; 3654 } 3655 3656 conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]); 3657 conf->remote_ip.sa.sa_family = AF_INET; 3658 } else if (data[IFLA_VXLAN_GROUP6]) { 3659 if (!IS_ENABLED(CONFIG_IPV6)) { 3660 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel"); 3661 return -EPFNOSUPPORT; 3662 } 3663 3664 if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) { 3665 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group"); 3666 return -EOPNOTSUPP; 3667 } 3668 3669 conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]); 3670 conf->remote_ip.sa.sa_family = AF_INET6; 3671 } 3672 3673 if (data[IFLA_VXLAN_LOCAL]) { 3674 if (changelink && (conf->saddr.sa.sa_family != AF_INET)) { 3675 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old"); 3676 return -EOPNOTSUPP; 3677 } 3678 3679 conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]); 3680 conf->saddr.sa.sa_family = AF_INET; 3681 } else if (data[IFLA_VXLAN_LOCAL6]) { 3682 if (!IS_ENABLED(CONFIG_IPV6)) { 3683 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel"); 3684 return -EPFNOSUPPORT; 3685 } 3686 3687 if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) { 3688 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old"); 3689 return -EOPNOTSUPP; 3690 } 3691 3692 /* TODO: respect scope id */ 3693 conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]); 3694 conf->saddr.sa.sa_family = AF_INET6; 3695 } 3696 3697 if (data[IFLA_VXLAN_LINK]) 3698 conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]); 3699 3700 if (data[IFLA_VXLAN_TOS]) 3701 conf->tos = nla_get_u8(data[IFLA_VXLAN_TOS]); 3702 3703 if (data[IFLA_VXLAN_TTL]) 3704 conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]); 3705 3706 if (data[IFLA_VXLAN_TTL_INHERIT]) { 3707 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT, 3708 VXLAN_F_TTL_INHERIT, changelink, false, 3709 extack); 3710 if (err) 3711 return err; 3712 3713 } 3714 3715 if (data[IFLA_VXLAN_LABEL]) 3716 conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) & 3717 IPV6_FLOWLABEL_MASK; 3718 3719 if (data[IFLA_VXLAN_LEARNING]) { 3720 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING, 3721 VXLAN_F_LEARN, changelink, true, 3722 extack); 3723 if (err) 3724 return err; 3725 } else if (!changelink) { 3726 /* default to learn on a new device */ 3727 conf->flags |= VXLAN_F_LEARN; 3728 } 3729 3730 if (data[IFLA_VXLAN_AGEING]) 3731 conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]); 3732 3733 if (data[IFLA_VXLAN_PROXY]) { 3734 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY, 3735 VXLAN_F_PROXY, changelink, false, 3736 extack); 3737 if (err) 3738 return err; 3739 } 3740 3741 if (data[IFLA_VXLAN_RSC]) { 3742 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC, 3743 VXLAN_F_RSC, changelink, false, 3744 extack); 3745 if (err) 3746 return err; 3747 } 3748 3749 if (data[IFLA_VXLAN_L2MISS]) { 3750 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS, 3751 VXLAN_F_L2MISS, changelink, false, 3752 extack); 3753 if (err) 3754 return err; 3755 } 3756 3757 if (data[IFLA_VXLAN_L3MISS]) { 3758 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS, 3759 VXLAN_F_L3MISS, changelink, false, 3760 extack); 3761 if (err) 3762 return err; 3763 } 3764 3765 if (data[IFLA_VXLAN_LIMIT]) { 3766 if (changelink) { 3767 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT], 3768 "Cannot change limit"); 3769 return -EOPNOTSUPP; 3770 } 3771 conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]); 3772 } 3773 3774 if (data[IFLA_VXLAN_COLLECT_METADATA]) { 3775 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA, 3776 VXLAN_F_COLLECT_METADATA, changelink, false, 3777 extack); 3778 if (err) 3779 return err; 3780 } 3781 3782 if (data[IFLA_VXLAN_PORT_RANGE]) { 3783 if (!changelink) { 3784 const struct ifla_vxlan_port_range *p 3785 = nla_data(data[IFLA_VXLAN_PORT_RANGE]); 3786 conf->port_min = ntohs(p->low); 3787 conf->port_max = ntohs(p->high); 3788 } else { 3789 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE], 3790 "Cannot change port range"); 3791 return -EOPNOTSUPP; 3792 } 3793 } 3794 3795 if (data[IFLA_VXLAN_PORT]) { 3796 if (changelink) { 3797 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT], 3798 "Cannot change port"); 3799 return -EOPNOTSUPP; 3800 } 3801 conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]); 3802 } 3803 3804 if (data[IFLA_VXLAN_UDP_CSUM]) { 3805 if (changelink) { 3806 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM], 3807 "Cannot change UDP_CSUM flag"); 3808 return -EOPNOTSUPP; 3809 } 3810 if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM])) 3811 conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX; 3812 } 3813 3814 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) { 3815 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 3816 VXLAN_F_UDP_ZERO_CSUM6_TX, changelink, 3817 false, extack); 3818 if (err) 3819 return err; 3820 } 3821 3822 if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) { 3823 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 3824 VXLAN_F_UDP_ZERO_CSUM6_RX, changelink, 3825 false, extack); 3826 if (err) 3827 return err; 3828 } 3829 3830 if (data[IFLA_VXLAN_REMCSUM_TX]) { 3831 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX, 3832 VXLAN_F_REMCSUM_TX, changelink, false, 3833 extack); 3834 if (err) 3835 return err; 3836 } 3837 3838 if (data[IFLA_VXLAN_REMCSUM_RX]) { 3839 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX, 3840 VXLAN_F_REMCSUM_RX, changelink, false, 3841 extack); 3842 if (err) 3843 return err; 3844 } 3845 3846 if (data[IFLA_VXLAN_GBP]) { 3847 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP, 3848 VXLAN_F_GBP, changelink, false, extack); 3849 if (err) 3850 return err; 3851 } 3852 3853 if (data[IFLA_VXLAN_GPE]) { 3854 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE, 3855 VXLAN_F_GPE, changelink, false, 3856 extack); 3857 if (err) 3858 return err; 3859 } 3860 3861 if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) { 3862 err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL, 3863 VXLAN_F_REMCSUM_NOPARTIAL, changelink, 3864 false, extack); 3865 if (err) 3866 return err; 3867 } 3868 3869 if (tb[IFLA_MTU]) { 3870 if (changelink) { 3871 NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], 3872 "Cannot change mtu"); 3873 return -EOPNOTSUPP; 3874 } 3875 conf->mtu = nla_get_u32(tb[IFLA_MTU]); 3876 } 3877 3878 if (data[IFLA_VXLAN_DF]) 3879 conf->df = nla_get_u8(data[IFLA_VXLAN_DF]); 3880 3881 return 0; 3882} 3883 3884static int vxlan_newlink(struct net *src_net, struct net_device *dev, 3885 struct nlattr *tb[], struct nlattr *data[], 3886 struct netlink_ext_ack *extack) 3887{ 3888 struct vxlan_config conf; 3889 int err; 3890 3891 err = vxlan_nl2conf(tb, data, dev, &conf, false, extack); 3892 if (err) 3893 return err; 3894 3895 return __vxlan_dev_create(src_net, dev, &conf, extack); 3896} 3897 3898static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], 3899 struct nlattr *data[], 3900 struct netlink_ext_ack *extack) 3901{ 3902 struct vxlan_dev *vxlan = netdev_priv(dev); 3903 struct vxlan_rdst *dst = &vxlan->default_dst; 3904 struct net_device *lowerdev; 3905 struct vxlan_config conf; 3906 int err; 3907 3908 err = vxlan_nl2conf(tb, data, dev, &conf, true, extack); 3909 if (err) 3910 return err; 3911 3912 err = vxlan_config_validate(vxlan->net, &conf, &lowerdev, 3913 vxlan, extack); 3914 if (err) 3915 return err; 3916 3917 /* handle default dst entry */ 3918 if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) { 3919 spin_lock_bh(&vxlan->hash_lock); 3920 if (!vxlan_addr_any(&conf.remote_ip)) { 3921 err = vxlan_fdb_update(vxlan, all_zeros_mac, 3922 &conf.remote_ip, 3923 NUD_REACHABLE | NUD_PERMANENT, 3924 NLM_F_APPEND | NLM_F_CREATE, 3925 vxlan->cfg.dst_port, 3926 conf.vni, conf.vni, 3927 conf.remote_ifindex, 3928 NTF_SELF, true, extack); 3929 if (err) { 3930 spin_unlock_bh(&vxlan->hash_lock); 3931 return err; 3932 } 3933 } 3934 if (!vxlan_addr_any(&dst->remote_ip)) 3935 __vxlan_fdb_delete(vxlan, all_zeros_mac, 3936 dst->remote_ip, 3937 vxlan->cfg.dst_port, 3938 dst->remote_vni, 3939 dst->remote_vni, 3940 dst->remote_ifindex, 3941 true); 3942 spin_unlock_bh(&vxlan->hash_lock); 3943 } 3944 3945 if (conf.age_interval != vxlan->cfg.age_interval) 3946 mod_timer(&vxlan->age_timer, jiffies); 3947 3948 vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); 3949 return 0; 3950} 3951 3952static void vxlan_dellink(struct net_device *dev, struct list_head *head) 3953{ 3954 struct vxlan_dev *vxlan = netdev_priv(dev); 3955 3956 vxlan_flush(vxlan, true); 3957 3958 list_del(&vxlan->next); 3959 unregister_netdevice_queue(dev, head); 3960} 3961 3962static size_t vxlan_get_size(const struct net_device *dev) 3963{ 3964 3965 return nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_ID */ 3966 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */ 3967 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LINK */ 3968 nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */ 3969 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL */ 3970 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TTL_INHERIT */ 3971 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_TOS */ 3972 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_DF */ 3973 nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */ 3974 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_LEARNING */ 3975 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_PROXY */ 3976 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_RSC */ 3977 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L2MISS */ 3978 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_L3MISS */ 3979 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_COLLECT_METADATA */ 3980 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_AGEING */ 3981 nla_total_size(sizeof(__u32)) + /* IFLA_VXLAN_LIMIT */ 3982 nla_total_size(sizeof(struct ifla_vxlan_port_range)) + 3983 nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */ 3984 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */ 3985 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */ 3986 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */ 3987 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */ 3988 nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */ 3989 0; 3990} 3991 3992static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev) 3993{ 3994 const struct vxlan_dev *vxlan = netdev_priv(dev); 3995 const struct vxlan_rdst *dst = &vxlan->default_dst; 3996 struct ifla_vxlan_port_range ports = { 3997 .low = htons(vxlan->cfg.port_min), 3998 .high = htons(vxlan->cfg.port_max), 3999 }; 4000 4001 if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni))) 4002 goto nla_put_failure; 4003 4004 if (!vxlan_addr_any(&dst->remote_ip)) { 4005 if (dst->remote_ip.sa.sa_family == AF_INET) { 4006 if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP, 4007 dst->remote_ip.sin.sin_addr.s_addr)) 4008 goto nla_put_failure; 4009#if IS_ENABLED(CONFIG_IPV6) 4010 } else { 4011 if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6, 4012 &dst->remote_ip.sin6.sin6_addr)) 4013 goto nla_put_failure; 4014#endif 4015 } 4016 } 4017 4018 if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex)) 4019 goto nla_put_failure; 4020 4021 if (!vxlan_addr_any(&vxlan->cfg.saddr)) { 4022 if (vxlan->cfg.saddr.sa.sa_family == AF_INET) { 4023 if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL, 4024 vxlan->cfg.saddr.sin.sin_addr.s_addr)) 4025 goto nla_put_failure; 4026#if IS_ENABLED(CONFIG_IPV6) 4027 } else { 4028 if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6, 4029 &vxlan->cfg.saddr.sin6.sin6_addr)) 4030 goto nla_put_failure; 4031#endif 4032 } 4033 } 4034 4035 if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) || 4036 nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT, 4037 !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) || 4038 nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) || 4039 nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) || 4040 nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) || 4041 nla_put_u8(skb, IFLA_VXLAN_LEARNING, 4042 !!(vxlan->cfg.flags & VXLAN_F_LEARN)) || 4043 nla_put_u8(skb, IFLA_VXLAN_PROXY, 4044 !!(vxlan->cfg.flags & VXLAN_F_PROXY)) || 4045 nla_put_u8(skb, IFLA_VXLAN_RSC, 4046 !!(vxlan->cfg.flags & VXLAN_F_RSC)) || 4047 nla_put_u8(skb, IFLA_VXLAN_L2MISS, 4048 !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) || 4049 nla_put_u8(skb, IFLA_VXLAN_L3MISS, 4050 !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) || 4051 nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA, 4052 !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) || 4053 nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) || 4054 nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) || 4055 nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) || 4056 nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM, 4057 !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) || 4058 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX, 4059 !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) || 4060 nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX, 4061 !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) || 4062 nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX, 4063 !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) || 4064 nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX, 4065 !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX))) 4066 goto nla_put_failure; 4067 4068 if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports)) 4069 goto nla_put_failure; 4070 4071 if (vxlan->cfg.flags & VXLAN_F_GBP && 4072 nla_put_flag(skb, IFLA_VXLAN_GBP)) 4073 goto nla_put_failure; 4074 4075 if (vxlan->cfg.flags & VXLAN_F_GPE && 4076 nla_put_flag(skb, IFLA_VXLAN_GPE)) 4077 goto nla_put_failure; 4078 4079 if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL && 4080 nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL)) 4081 goto nla_put_failure; 4082 4083 return 0; 4084 4085nla_put_failure: 4086 return -EMSGSIZE; 4087} 4088 4089static struct net *vxlan_get_link_net(const struct net_device *dev) 4090{ 4091 struct vxlan_dev *vxlan = netdev_priv(dev); 4092 4093 return vxlan->net; 4094} 4095 4096static struct rtnl_link_ops vxlan_link_ops __read_mostly = { 4097 .kind = "vxlan", 4098 .maxtype = IFLA_VXLAN_MAX, 4099 .policy = vxlan_policy, 4100 .priv_size = sizeof(struct vxlan_dev), 4101 .setup = vxlan_setup, 4102 .validate = vxlan_validate, 4103 .newlink = vxlan_newlink, 4104 .changelink = vxlan_changelink, 4105 .dellink = vxlan_dellink, 4106 .get_size = vxlan_get_size, 4107 .fill_info = vxlan_fill_info, 4108 .get_link_net = vxlan_get_link_net, 4109}; 4110 4111struct net_device *vxlan_dev_create(struct net *net, const char *name, 4112 u8 name_assign_type, 4113 struct vxlan_config *conf) 4114{ 4115 struct nlattr *tb[IFLA_MAX + 1]; 4116 struct net_device *dev; 4117 int err; 4118 4119 memset(&tb, 0, sizeof(tb)); 4120 4121 dev = rtnl_create_link(net, name, name_assign_type, 4122 &vxlan_link_ops, tb, NULL); 4123 if (IS_ERR(dev)) 4124 return dev; 4125 4126 err = __vxlan_dev_create(net, dev, conf, NULL); 4127 if (err < 0) { 4128 free_netdev(dev); 4129 return ERR_PTR(err); 4130 } 4131 4132 err = rtnl_configure_link(dev, NULL); 4133 if (err < 0) { 4134 LIST_HEAD(list_kill); 4135 4136 vxlan_dellink(dev, &list_kill); 4137 unregister_netdevice_many(&list_kill); 4138 return ERR_PTR(err); 4139 } 4140 4141 return dev; 4142} 4143EXPORT_SYMBOL_GPL(vxlan_dev_create); 4144 4145static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn, 4146 struct net_device *dev) 4147{ 4148 struct vxlan_dev *vxlan, *next; 4149 LIST_HEAD(list_kill); 4150 4151 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 4152 struct vxlan_rdst *dst = &vxlan->default_dst; 4153 4154 /* In case we created vxlan device with carrier 4155 * and we loose the carrier due to module unload 4156 * we also need to remove vxlan device. In other 4157 * cases, it's not necessary and remote_ifindex 4158 * is 0 here, so no matches. 4159 */ 4160 if (dst->remote_ifindex == dev->ifindex) 4161 vxlan_dellink(vxlan->dev, &list_kill); 4162 } 4163 4164 unregister_netdevice_many(&list_kill); 4165} 4166 4167static int vxlan_netdevice_event(struct notifier_block *unused, 4168 unsigned long event, void *ptr) 4169{ 4170 struct net_device *dev = netdev_notifier_info_to_dev(ptr); 4171 struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); 4172 4173 if (event == NETDEV_UNREGISTER) { 4174 vxlan_offload_rx_ports(dev, false); 4175 vxlan_handle_lowerdev_unregister(vn, dev); 4176 } else if (event == NETDEV_REGISTER) { 4177 vxlan_offload_rx_ports(dev, true); 4178 } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || 4179 event == NETDEV_UDP_TUNNEL_DROP_INFO) { 4180 vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); 4181 } 4182 4183 return NOTIFY_DONE; 4184} 4185 4186static struct notifier_block vxlan_notifier_block __read_mostly = { 4187 .notifier_call = vxlan_netdevice_event, 4188}; 4189 4190static void 4191vxlan_fdb_offloaded_set(struct net_device *dev, 4192 struct switchdev_notifier_vxlan_fdb_info *fdb_info) 4193{ 4194 struct vxlan_dev *vxlan = netdev_priv(dev); 4195 struct vxlan_rdst *rdst; 4196 struct vxlan_fdb *f; 4197 4198 spin_lock_bh(&vxlan->hash_lock); 4199 4200 f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); 4201 if (!f) 4202 goto out; 4203 4204 rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip, 4205 fdb_info->remote_port, 4206 fdb_info->remote_vni, 4207 fdb_info->remote_ifindex); 4208 if (!rdst) 4209 goto out; 4210 4211 rdst->offloaded = fdb_info->offloaded; 4212 4213out: 4214 spin_unlock_bh(&vxlan->hash_lock); 4215} 4216 4217static int 4218vxlan_fdb_external_learn_add(struct net_device *dev, 4219 struct switchdev_notifier_vxlan_fdb_info *fdb_info) 4220{ 4221 struct vxlan_dev *vxlan = netdev_priv(dev); 4222 struct netlink_ext_ack *extack; 4223 int err; 4224 4225 extack = switchdev_notifier_info_to_extack(&fdb_info->info); 4226 4227 spin_lock_bh(&vxlan->hash_lock); 4228 err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip, 4229 NUD_REACHABLE, 4230 NLM_F_CREATE | NLM_F_REPLACE, 4231 fdb_info->remote_port, 4232 fdb_info->vni, 4233 fdb_info->remote_vni, 4234 fdb_info->remote_ifindex, 4235 NTF_USE | NTF_SELF | NTF_EXT_LEARNED, 4236 false, extack); 4237 spin_unlock_bh(&vxlan->hash_lock); 4238 4239 return err; 4240} 4241 4242static int 4243vxlan_fdb_external_learn_del(struct net_device *dev, 4244 struct switchdev_notifier_vxlan_fdb_info *fdb_info) 4245{ 4246 struct vxlan_dev *vxlan = netdev_priv(dev); 4247 struct vxlan_fdb *f; 4248 int err = 0; 4249 4250 spin_lock_bh(&vxlan->hash_lock); 4251 4252 f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni); 4253 if (!f) 4254 err = -ENOENT; 4255 else if (f->flags & NTF_EXT_LEARNED) 4256 err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr, 4257 fdb_info->remote_ip, 4258 fdb_info->remote_port, 4259 fdb_info->vni, 4260 fdb_info->remote_vni, 4261 fdb_info->remote_ifindex, 4262 false); 4263 4264 spin_unlock_bh(&vxlan->hash_lock); 4265 4266 return err; 4267} 4268 4269static int vxlan_switchdev_event(struct notifier_block *unused, 4270 unsigned long event, void *ptr) 4271{ 4272 struct net_device *dev = switchdev_notifier_info_to_dev(ptr); 4273 struct switchdev_notifier_vxlan_fdb_info *fdb_info; 4274 int err = 0; 4275 4276 switch (event) { 4277 case SWITCHDEV_VXLAN_FDB_OFFLOADED: 4278 vxlan_fdb_offloaded_set(dev, ptr); 4279 break; 4280 case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE: 4281 fdb_info = ptr; 4282 err = vxlan_fdb_external_learn_add(dev, fdb_info); 4283 if (err) { 4284 err = notifier_from_errno(err); 4285 break; 4286 } 4287 fdb_info->offloaded = true; 4288 vxlan_fdb_offloaded_set(dev, fdb_info); 4289 break; 4290 case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE: 4291 fdb_info = ptr; 4292 err = vxlan_fdb_external_learn_del(dev, fdb_info); 4293 if (err) { 4294 err = notifier_from_errno(err); 4295 break; 4296 } 4297 fdb_info->offloaded = false; 4298 vxlan_fdb_offloaded_set(dev, fdb_info); 4299 break; 4300 } 4301 4302 return err; 4303} 4304 4305static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = { 4306 .notifier_call = vxlan_switchdev_event, 4307}; 4308 4309static __net_init int vxlan_init_net(struct net *net) 4310{ 4311 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 4312 unsigned int h; 4313 4314 INIT_LIST_HEAD(&vn->vxlan_list); 4315 spin_lock_init(&vn->sock_lock); 4316 4317 for (h = 0; h < PORT_HASH_SIZE; ++h) 4318 INIT_HLIST_HEAD(&vn->sock_list[h]); 4319 4320 return 0; 4321} 4322 4323static void vxlan_destroy_tunnels(struct net *net, struct list_head *head) 4324{ 4325 struct vxlan_net *vn = net_generic(net, vxlan_net_id); 4326 struct vxlan_dev *vxlan, *next; 4327 struct net_device *dev, *aux; 4328 unsigned int h; 4329 4330 for_each_netdev_safe(net, dev, aux) 4331 if (dev->rtnl_link_ops == &vxlan_link_ops) 4332 unregister_netdevice_queue(dev, head); 4333 4334 list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) { 4335 /* If vxlan->dev is in the same netns, it has already been added 4336 * to the list by the previous loop. 4337 */ 4338 if (!net_eq(dev_net(vxlan->dev), net)) { 4339 gro_cells_destroy(&vxlan->gro_cells); 4340 unregister_netdevice_queue(vxlan->dev, head); 4341 } 4342 } 4343 4344 for (h = 0; h < PORT_HASH_SIZE; ++h) 4345 WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h])); 4346} 4347 4348static void __net_exit vxlan_exit_batch_net(struct list_head *net_list) 4349{ 4350 struct net *net; 4351 LIST_HEAD(list); 4352 4353 rtnl_lock(); 4354 list_for_each_entry(net, net_list, exit_list) 4355 vxlan_destroy_tunnels(net, &list); 4356 4357 unregister_netdevice_many(&list); 4358 rtnl_unlock(); 4359} 4360 4361static struct pernet_operations vxlan_net_ops = { 4362 .init = vxlan_init_net, 4363 .exit_batch = vxlan_exit_batch_net, 4364 .id = &vxlan_net_id, 4365 .size = sizeof(struct vxlan_net), 4366}; 4367 4368static int __init vxlan_init_module(void) 4369{ 4370 int rc; 4371 4372 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt)); 4373 4374 rc = register_pernet_subsys(&vxlan_net_ops); 4375 if (rc) 4376 goto out1; 4377 4378 rc = register_netdevice_notifier(&vxlan_notifier_block); 4379 if (rc) 4380 goto out2; 4381 4382 rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block); 4383 if (rc) 4384 goto out3; 4385 4386 rc = rtnl_link_register(&vxlan_link_ops); 4387 if (rc) 4388 goto out4; 4389 4390 return 0; 4391out4: 4392 unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); 4393out3: 4394 unregister_netdevice_notifier(&vxlan_notifier_block); 4395out2: 4396 unregister_pernet_subsys(&vxlan_net_ops); 4397out1: 4398 return rc; 4399} 4400late_initcall(vxlan_init_module); 4401 4402static void __exit vxlan_cleanup_module(void) 4403{ 4404 rtnl_link_unregister(&vxlan_link_ops); 4405 unregister_switchdev_notifier(&vxlan_switchdev_notifier_block); 4406 unregister_netdevice_notifier(&vxlan_notifier_block); 4407 unregister_pernet_subsys(&vxlan_net_ops); 4408 /* rcu_barrier() is called by netns */ 4409} 4410module_exit(vxlan_cleanup_module); 4411 4412MODULE_LICENSE("GPL"); 4413MODULE_VERSION(VXLAN_VERSION); 4414MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>"); 4415MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic"); 4416MODULE_ALIAS_RTNL_LINK("vxlan");