drivers/net/vrf.c at v5.2 · tjh.dev/kernel

tjh.dev / kernel
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
kernel / drivers / net / vrf.c
at v5.2 1471 lines 34 kB view raw
   1// SPDX-License-Identifier: GPL-2.0-or-later
   2/*
   3 * vrf.c: device driver to encapsulate a VRF space
   4 *
   5 * Copyright (c) 2015 Cumulus Networks. All rights reserved.
   6 * Copyright (c) 2015 Shrijeet Mukherjee <shm@cumulusnetworks.com>
   7 * Copyright (c) 2015 David Ahern <dsa@cumulusnetworks.com>
   8 *
   9 * Based on dummy, team and ipvlan drivers
  10 */
  11
  12#include <linux/module.h>
  13#include <linux/kernel.h>
  14#include <linux/netdevice.h>
  15#include <linux/etherdevice.h>
  16#include <linux/ip.h>
  17#include <linux/init.h>
  18#include <linux/moduleparam.h>
  19#include <linux/netfilter.h>
  20#include <linux/rtnetlink.h>
  21#include <net/rtnetlink.h>
  22#include <linux/u64_stats_sync.h>
  23#include <linux/hashtable.h>
  24
  25#include <linux/inetdevice.h>
  26#include <net/arp.h>
  27#include <net/ip.h>
  28#include <net/ip_fib.h>
  29#include <net/ip6_fib.h>
  30#include <net/ip6_route.h>
  31#include <net/route.h>
  32#include <net/addrconf.h>
  33#include <net/l3mdev.h>
  34#include <net/fib_rules.h>
  35#include <net/netns/generic.h>
  36
/* driver name and version strings reported by vrf_get_drvinfo() */
#define DRV_NAME	"vrf"
#define DRV_VERSION	"1.0"

/* value placed in the FRA_PRIORITY attribute by vrf_fib_rule() */
#define FIB_RULE_PREF  1000       /* default preference for FIB rules */

/* NOTE(review): presumably the pernet id for this driver's per-netns data;
 * not used in this part of the file - confirm against its users.
 */
static unsigned int vrf_net_id;
  43
/* Private state of a VRF device, obtained with netdev_priv(). */
struct net_vrf {
	/* IPv4 dst whose output handler is vrf_output(); published by
	 * vrf_rtable_create() and cleared by vrf_rtable_release()
	 */
	struct rtable __rcu	*rth;
	/* IPv6 dst whose output handler is vrf_output6(); published by
	 * vrf_rt6_create() and cleared by vrf_rt6_release()
	 */
	struct rt6_info	__rcu	*rt6;
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 FIB table for tb_id, looked up/created in vrf_rt6_create() */
	struct fib6_table	*fib6_table;
#endif
	/* FIB table id of this VRF; returned by vrf_fib_table() */
	u32                     tb_id;
};
  52
/* Per-CPU traffic counters of a VRF device (dev->dstats). The packet and
 * byte counters are updated inside u64_stats_update_begin()/end() sections
 * on syncp; the drop counters are bumped with this_cpu_inc().
 */
struct pcpu_dstats {
	u64			tx_pkts;	/* packets sent, see vrf_xmit() */
	u64			tx_bytes;	/* bytes sent, see vrf_xmit() */
	u64			tx_drps;	/* failed transmits, see vrf_xmit() */
	u64			rx_pkts;	/* packets received, see vrf_rx_stats() */
	u64			rx_bytes;	/* bytes received, see vrf_rx_stats() */
	u64			rx_drps;	/* netif_rx() failures in vrf_local_xmit() */
	struct u64_stats_sync	syncp;
};
  62
  63static void vrf_rx_stats(struct net_device *dev, int len)
  64{
  65	struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
  66
  67	u64_stats_update_begin(&dstats->syncp);
  68	dstats->rx_pkts++;
  69	dstats->rx_bytes += len;
  70	u64_stats_update_end(&dstats->syncp);
  71}
  72
  73static void vrf_tx_error(struct net_device *vrf_dev, struct sk_buff *skb)
  74{
  75	vrf_dev->stats.tx_errors++;
  76	kfree_skb(skb);
  77}
  78
  79static void vrf_get_stats64(struct net_device *dev,
  80			    struct rtnl_link_stats64 *stats)
  81{
  82	int i;
  83
  84	for_each_possible_cpu(i) {
  85		const struct pcpu_dstats *dstats;
  86		u64 tbytes, tpkts, tdrops, rbytes, rpkts;
  87		unsigned int start;
  88
  89		dstats = per_cpu_ptr(dev->dstats, i);
  90		do {
  91			start = u64_stats_fetch_begin_irq(&dstats->syncp);
  92			tbytes = dstats->tx_bytes;
  93			tpkts = dstats->tx_pkts;
  94			tdrops = dstats->tx_drps;
  95			rbytes = dstats->rx_bytes;
  96			rpkts = dstats->rx_pkts;
  97		} while (u64_stats_fetch_retry_irq(&dstats->syncp, start));
  98		stats->tx_bytes += tbytes;
  99		stats->tx_packets += tpkts;
 100		stats->tx_dropped += tdrops;
 101		stats->rx_bytes += rbytes;
 102		stats->rx_packets += rpkts;
 103	}
 104}
 105
 106/* by default VRF devices do not have a qdisc and are expected
 107 * to be created with only a single queue.
 108 */
 109static bool qdisc_tx_is_default(const struct net_device *dev)
 110{
 111	struct netdev_queue *txq;
 112	struct Qdisc *qdisc;
 113
 114	if (dev->num_tx_queues > 1)
 115		return false;
 116
 117	txq = netdev_get_tx_queue(dev, 0);
 118	qdisc = rcu_access_pointer(txq->qdisc);
 119
 120	return !qdisc->enqueue;
 121}
 122
 123/* Local traffic destined to local address. Reinsert the packet to rx
 124 * path, similar to loopback handling.
 125 */
 126static int vrf_local_xmit(struct sk_buff *skb, struct net_device *dev,
 127			  struct dst_entry *dst)
 128{
 129	int len = skb->len;
 130
 131	skb_orphan(skb);
 132
 133	skb_dst_set(skb, dst);
 134
 135	/* set pkt_type to avoid skb hitting packet taps twice -
 136	 * once on Tx and again in Rx processing
 137	 */
 138	skb->pkt_type = PACKET_LOOPBACK;
 139
 140	skb->protocol = eth_type_trans(skb, dev);
 141
 142	if (likely(netif_rx(skb) == NET_RX_SUCCESS))
 143		vrf_rx_stats(dev, len);
 144	else
 145		this_cpu_inc(dev->dstats->rx_drps);
 146
 147	return NETDEV_TX_OK;
 148}
 149
 150#if IS_ENABLED(CONFIG_IPV6)
 151static int vrf_ip6_local_out(struct net *net, struct sock *sk,
 152			     struct sk_buff *skb)
 153{
 154	int err;
 155
 156	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net,
 157		      sk, skb, NULL, skb_dst(skb)->dev, dst_output);
 158
 159	if (likely(err == 1))
 160		err = dst_output(net, sk, skb);
 161
 162	return err;
 163}
 164
 165static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
 166					   struct net_device *dev)
 167{
 168	const struct ipv6hdr *iph = ipv6_hdr(skb);
 169	struct net *net = dev_net(skb->dev);
 170	struct flowi6 fl6 = {
 171		/* needed to match OIF rule */
 172		.flowi6_oif = dev->ifindex,
 173		.flowi6_iif = LOOPBACK_IFINDEX,
 174		.daddr = iph->daddr,
 175		.saddr = iph->saddr,
 176		.flowlabel = ip6_flowinfo(iph),
 177		.flowi6_mark = skb->mark,
 178		.flowi6_proto = iph->nexthdr,
 179		.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF,
 180	};
 181	int ret = NET_XMIT_DROP;
 182	struct dst_entry *dst;
 183	struct dst_entry *dst_null = &net->ipv6.ip6_null_entry->dst;
 184
 185	dst = ip6_route_output(net, NULL, &fl6);
 186	if (dst == dst_null)
 187		goto err;
 188
 189	skb_dst_drop(skb);
 190
 191	/* if dst.dev is loopback or the VRF device again this is locally
 192	 * originated traffic destined to a local address. Short circuit
 193	 * to Rx path
 194	 */
 195	if (dst->dev == dev)
 196		return vrf_local_xmit(skb, dev, dst);
 197
 198	skb_dst_set(skb, dst);
 199
 200	/* strip the ethernet header added for pass through VRF device */
 201	__skb_pull(skb, skb_network_offset(skb));
 202
 203	ret = vrf_ip6_local_out(net, skb->sk, skb);
 204	if (unlikely(net_xmit_eval(ret)))
 205		dev->stats.tx_errors++;
 206	else
 207		ret = NET_XMIT_SUCCESS;
 208
 209	return ret;
 210err:
 211	vrf_tx_error(dev, skb);
 212	return NET_XMIT_DROP;
 213}
 214#else
/* CONFIG_IPV6 disabled: IPv6 frames cannot be forwarded, so count a Tx
 * error, free the skb and report the drop.
 */
static netdev_tx_t vrf_process_v6_outbound(struct sk_buff *skb,
					   struct net_device *dev)
{
	vrf_tx_error(dev, skb);
	return NET_XMIT_DROP;
}
 221#endif
 222
 223/* based on ip_local_out; can't use it b/c the dst is switched pointing to us */
 224static int vrf_ip_local_out(struct net *net, struct sock *sk,
 225			    struct sk_buff *skb)
 226{
 227	int err;
 228
 229	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
 230		      skb, NULL, skb_dst(skb)->dev, dst_output);
 231	if (likely(err == 1))
 232		err = dst_output(net, sk, skb);
 233
 234	return err;
 235}
 236
 237static netdev_tx_t vrf_process_v4_outbound(struct sk_buff *skb,
 238					   struct net_device *vrf_dev)
 239{
 240	struct iphdr *ip4h = ip_hdr(skb);
 241	int ret = NET_XMIT_DROP;
 242	struct flowi4 fl4 = {
 243		/* needed to match OIF rule */
 244		.flowi4_oif = vrf_dev->ifindex,
 245		.flowi4_iif = LOOPBACK_IFINDEX,
 246		.flowi4_tos = RT_TOS(ip4h->tos),
 247		.flowi4_flags = FLOWI_FLAG_ANYSRC | FLOWI_FLAG_SKIP_NH_OIF,
 248		.flowi4_proto = ip4h->protocol,
 249		.daddr = ip4h->daddr,
 250		.saddr = ip4h->saddr,
 251	};
 252	struct net *net = dev_net(vrf_dev);
 253	struct rtable *rt;
 254
 255	rt = ip_route_output_flow(net, &fl4, NULL);
 256	if (IS_ERR(rt))
 257		goto err;
 258
 259	skb_dst_drop(skb);
 260
 261	/* if dst.dev is loopback or the VRF device again this is locally
 262	 * originated traffic destined to a local address. Short circuit
 263	 * to Rx path
 264	 */
 265	if (rt->dst.dev == vrf_dev)
 266		return vrf_local_xmit(skb, vrf_dev, &rt->dst);
 267
 268	skb_dst_set(skb, &rt->dst);
 269
 270	/* strip the ethernet header added for pass through VRF device */
 271	__skb_pull(skb, skb_network_offset(skb));
 272
 273	if (!ip4h->saddr) {
 274		ip4h->saddr = inet_select_addr(skb_dst(skb)->dev, 0,
 275					       RT_SCOPE_LINK);
 276	}
 277
 278	ret = vrf_ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
 279	if (unlikely(net_xmit_eval(ret)))
 280		vrf_dev->stats.tx_errors++;
 281	else
 282		ret = NET_XMIT_SUCCESS;
 283
 284out:
 285	return ret;
 286err:
 287	vrf_tx_error(vrf_dev, skb);
 288	goto out;
 289}
 290
 291static netdev_tx_t is_ip_tx_frame(struct sk_buff *skb, struct net_device *dev)
 292{
 293	switch (skb->protocol) {
 294	case htons(ETH_P_IP):
 295		return vrf_process_v4_outbound(skb, dev);
 296	case htons(ETH_P_IPV6):
 297		return vrf_process_v6_outbound(skb, dev);
 298	default:
 299		vrf_tx_error(dev, skb);
 300		return NET_XMIT_DROP;
 301	}
 302}
 303
 304static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev)
 305{
 306	int len = skb->len;
 307	netdev_tx_t ret = is_ip_tx_frame(skb, dev);
 308
 309	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 310		struct pcpu_dstats *dstats = this_cpu_ptr(dev->dstats);
 311
 312		u64_stats_update_begin(&dstats->syncp);
 313		dstats->tx_pkts++;
 314		dstats->tx_bytes += len;
 315		u64_stats_update_end(&dstats->syncp);
 316	} else {
 317		this_cpu_inc(dev->dstats->tx_drps);
 318	}
 319
 320	return ret;
 321}
 322
 323static int vrf_finish_direct(struct net *net, struct sock *sk,
 324			     struct sk_buff *skb)
 325{
 326	struct net_device *vrf_dev = skb->dev;
 327
 328	if (!list_empty(&vrf_dev->ptype_all) &&
 329	    likely(skb_headroom(skb) >= ETH_HLEN)) {
 330		struct ethhdr *eth = skb_push(skb, ETH_HLEN);
 331
 332		ether_addr_copy(eth->h_source, vrf_dev->dev_addr);
 333		eth_zero_addr(eth->h_dest);
 334		eth->h_proto = skb->protocol;
 335
 336		rcu_read_lock_bh();
 337		dev_queue_xmit_nit(skb, vrf_dev);
 338		rcu_read_unlock_bh();
 339
 340		skb_pull(skb, ETH_HLEN);
 341	}
 342
 343	return 1;
 344}
 345
 346#if IS_ENABLED(CONFIG_IPV6)
 347/* modelled after ip6_finish_output2 */
 348static int vrf_finish_output6(struct net *net, struct sock *sk,
 349			      struct sk_buff *skb)
 350{
 351	struct dst_entry *dst = skb_dst(skb);
 352	struct net_device *dev = dst->dev;
 353	const struct in6_addr *nexthop;
 354	struct neighbour *neigh;
 355	int ret;
 356
 357	nf_reset(skb);
 358
 359	skb->protocol = htons(ETH_P_IPV6);
 360	skb->dev = dev;
 361
 362	rcu_read_lock_bh();
 363	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 364	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 365	if (unlikely(!neigh))
 366		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 367	if (!IS_ERR(neigh)) {
 368		sock_confirm_neigh(skb, neigh);
 369		ret = neigh_output(neigh, skb, false);
 370		rcu_read_unlock_bh();
 371		return ret;
 372	}
 373	rcu_read_unlock_bh();
 374
 375	IP6_INC_STATS(dev_net(dst->dev),
 376		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 377	kfree_skb(skb);
 378	return -EINVAL;
 379}
 380
 381/* modelled after ip6_output */
 382static int vrf_output6(struct net *net, struct sock *sk, struct sk_buff *skb)
 383{
 384	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 385			    net, sk, skb, NULL, skb_dst(skb)->dev,
 386			    vrf_finish_output6,
 387			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 388}
 389
 390/* set dst on skb to send packet to us via dev_xmit path. Allows
 391 * packet to go through device based features such as qdisc, netfilter
 392 * hooks and packet sockets with skb->dev set to vrf device.
 393 */
 394static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev,
 395					    struct sk_buff *skb)
 396{
 397	struct net_vrf *vrf = netdev_priv(vrf_dev);
 398	struct dst_entry *dst = NULL;
 399	struct rt6_info *rt6;
 400
 401	rcu_read_lock();
 402
 403	rt6 = rcu_dereference(vrf->rt6);
 404	if (likely(rt6)) {
 405		dst = &rt6->dst;
 406		dst_hold(dst);
 407	}
 408
 409	rcu_read_unlock();
 410
 411	if (unlikely(!dst)) {
 412		vrf_tx_error(vrf_dev, skb);
 413		return NULL;
 414	}
 415
 416	skb_dst_drop(skb);
 417	skb_dst_set(skb, dst);
 418
 419	return skb;
 420}
 421
 422static int vrf_output6_direct(struct net *net, struct sock *sk,
 423			      struct sk_buff *skb)
 424{
 425	skb->protocol = htons(ETH_P_IPV6);
 426
 427	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 428			    net, sk, skb, NULL, skb->dev,
 429			    vrf_finish_direct,
 430			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 431}
 432
 433static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev,
 434					  struct sock *sk,
 435					  struct sk_buff *skb)
 436{
 437	struct net *net = dev_net(vrf_dev);
 438	int err;
 439
 440	skb->dev = vrf_dev;
 441
 442	err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk,
 443		      skb, NULL, vrf_dev, vrf_output6_direct);
 444
 445	if (likely(err == 1))
 446		err = vrf_output6_direct(net, sk, skb);
 447
 448	/* reset skb device */
 449	if (likely(err == 1))
 450		nf_reset(skb);
 451	else
 452		skb = NULL;
 453
 454	return skb;
 455}
 456
 457static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
 458				   struct sock *sk,
 459				   struct sk_buff *skb)
 460{
 461	/* don't divert link scope packets */
 462	if (rt6_need_strict(&ipv6_hdr(skb)->daddr))
 463		return skb;
 464
 465	if (qdisc_tx_is_default(vrf_dev))
 466		return vrf_ip6_out_direct(vrf_dev, sk, skb);
 467
 468	return vrf_ip6_out_redirect(vrf_dev, skb);
 469}
 470
 471/* holding rtnl */
 472static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
 473{
 474	struct rt6_info *rt6 = rtnl_dereference(vrf->rt6);
 475	struct net *net = dev_net(dev);
 476	struct dst_entry *dst;
 477
 478	RCU_INIT_POINTER(vrf->rt6, NULL);
 479	synchronize_rcu();
 480
 481	/* move dev in dst's to loopback so this VRF device can be deleted
 482	 * - based on dst_ifdown
 483	 */
 484	if (rt6) {
 485		dst = &rt6->dst;
 486		dev_put(dst->dev);
 487		dst->dev = net->loopback_dev;
 488		dev_hold(dst->dev);
 489		dst_release(dst);
 490	}
 491}
 492
 493static int vrf_rt6_create(struct net_device *dev)
 494{
 495	int flags = DST_HOST | DST_NOPOLICY | DST_NOXFRM;
 496	struct net_vrf *vrf = netdev_priv(dev);
 497	struct net *net = dev_net(dev);
 498	struct rt6_info *rt6;
 499	int rc = -ENOMEM;
 500
 501	/* IPv6 can be CONFIG enabled and then disabled runtime */
 502	if (!ipv6_mod_enabled())
 503		return 0;
 504
 505	vrf->fib6_table = fib6_new_table(net, vrf->tb_id);
 506	if (!vrf->fib6_table)
 507		goto out;
 508
 509	/* create a dst for routing packets out a VRF device */
 510	rt6 = ip6_dst_alloc(net, dev, flags);
 511	if (!rt6)
 512		goto out;
 513
 514	rt6->dst.output	= vrf_output6;
 515
 516	rcu_assign_pointer(vrf->rt6, rt6);
 517
 518	rc = 0;
 519out:
 520	return rc;
 521}
 522#else
/* CONFIG_IPV6 disabled: nothing to do, hand the skb back unchanged. */
static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev,
				   struct sock *sk,
				   struct sk_buff *skb)
{
	return skb;
}
 529
/* CONFIG_IPV6 disabled: there is no IPv6 dst to release. */
static void vrf_rt6_release(struct net_device *dev, struct net_vrf *vrf)
{
}
 533
/* CONFIG_IPV6 disabled: no IPv6 state to create; always succeeds. */
static int vrf_rt6_create(struct net_device *dev)
{
	return 0;
}
 538#endif
 539
 540/* modelled after ip_finish_output2 */
 541static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 542{
 543	struct dst_entry *dst = skb_dst(skb);
 544	struct rtable *rt = (struct rtable *)dst;
 545	struct net_device *dev = dst->dev;
 546	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 547	struct neighbour *neigh;
 548	bool is_v6gw = false;
 549	int ret = -EINVAL;
 550
 551	nf_reset(skb);
 552
 553	/* Be paranoid, rather than too clever. */
 554	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
 555		struct sk_buff *skb2;
 556
 557		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
 558		if (!skb2) {
 559			ret = -ENOMEM;
 560			goto err;
 561		}
 562		if (skb->sk)
 563			skb_set_owner_w(skb2, skb->sk);
 564
 565		consume_skb(skb);
 566		skb = skb2;
 567	}
 568
 569	rcu_read_lock_bh();
 570
 571	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 572	if (!IS_ERR(neigh)) {
 573		sock_confirm_neigh(skb, neigh);
 574		/* if crossing protocols, can not use the cached header */
 575		ret = neigh_output(neigh, skb, is_v6gw);
 576		rcu_read_unlock_bh();
 577		return ret;
 578	}
 579
 580	rcu_read_unlock_bh();
 581err:
 582	vrf_tx_error(skb->dev, skb);
 583	return ret;
 584}
 585
 586static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 587{
 588	struct net_device *dev = skb_dst(skb)->dev;
 589
 590	IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
 591
 592	skb->dev = dev;
 593	skb->protocol = htons(ETH_P_IP);
 594
 595	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 596			    net, sk, skb, NULL, dev,
 597			    vrf_finish_output,
 598			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 599}
 600
 601/* set dst on skb to send packet to us via dev_xmit path. Allows
 602 * packet to go through device based features such as qdisc, netfilter
 603 * hooks and packet sockets with skb->dev set to vrf device.
 604 */
 605static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev,
 606					   struct sk_buff *skb)
 607{
 608	struct net_vrf *vrf = netdev_priv(vrf_dev);
 609	struct dst_entry *dst = NULL;
 610	struct rtable *rth;
 611
 612	rcu_read_lock();
 613
 614	rth = rcu_dereference(vrf->rth);
 615	if (likely(rth)) {
 616		dst = &rth->dst;
 617		dst_hold(dst);
 618	}
 619
 620	rcu_read_unlock();
 621
 622	if (unlikely(!dst)) {
 623		vrf_tx_error(vrf_dev, skb);
 624		return NULL;
 625	}
 626
 627	skb_dst_drop(skb);
 628	skb_dst_set(skb, dst);
 629
 630	return skb;
 631}
 632
 633static int vrf_output_direct(struct net *net, struct sock *sk,
 634			     struct sk_buff *skb)
 635{
 636	skb->protocol = htons(ETH_P_IP);
 637
 638	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
 639			    net, sk, skb, NULL, skb->dev,
 640			    vrf_finish_direct,
 641			    !(IPCB(skb)->flags & IPSKB_REROUTED));
 642}
 643
 644static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev,
 645					 struct sock *sk,
 646					 struct sk_buff *skb)
 647{
 648	struct net *net = dev_net(vrf_dev);
 649	int err;
 650
 651	skb->dev = vrf_dev;
 652
 653	err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk,
 654		      skb, NULL, vrf_dev, vrf_output_direct);
 655
 656	if (likely(err == 1))
 657		err = vrf_output_direct(net, sk, skb);
 658
 659	/* reset skb device */
 660	if (likely(err == 1))
 661		nf_reset(skb);
 662	else
 663		skb = NULL;
 664
 665	return skb;
 666}
 667
 668static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev,
 669				  struct sock *sk,
 670				  struct sk_buff *skb)
 671{
 672	/* don't divert multicast or local broadcast */
 673	if (ipv4_is_multicast(ip_hdr(skb)->daddr) ||
 674	    ipv4_is_lbcast(ip_hdr(skb)->daddr))
 675		return skb;
 676
 677	if (qdisc_tx_is_default(vrf_dev))
 678		return vrf_ip_out_direct(vrf_dev, sk, skb);
 679
 680	return vrf_ip_out_redirect(vrf_dev, skb);
 681}
 682
 683/* called with rcu lock held */
 684static struct sk_buff *vrf_l3_out(struct net_device *vrf_dev,
 685				  struct sock *sk,
 686				  struct sk_buff *skb,
 687				  u16 proto)
 688{
 689	switch (proto) {
 690	case AF_INET:
 691		return vrf_ip_out(vrf_dev, sk, skb);
 692	case AF_INET6:
 693		return vrf_ip6_out(vrf_dev, sk, skb);
 694	}
 695
 696	return skb;
 697}
 698
 699/* holding rtnl */
 700static void vrf_rtable_release(struct net_device *dev, struct net_vrf *vrf)
 701{
 702	struct rtable *rth = rtnl_dereference(vrf->rth);
 703	struct net *net = dev_net(dev);
 704	struct dst_entry *dst;
 705
 706	RCU_INIT_POINTER(vrf->rth, NULL);
 707	synchronize_rcu();
 708
 709	/* move dev in dst's to loopback so this VRF device can be deleted
 710	 * - based on dst_ifdown
 711	 */
 712	if (rth) {
 713		dst = &rth->dst;
 714		dev_put(dst->dev);
 715		dst->dev = net->loopback_dev;
 716		dev_hold(dst->dev);
 717		dst_release(dst);
 718	}
 719}
 720
 721static int vrf_rtable_create(struct net_device *dev)
 722{
 723	struct net_vrf *vrf = netdev_priv(dev);
 724	struct rtable *rth;
 725
 726	if (!fib_new_table(dev_net(dev), vrf->tb_id))
 727		return -ENOMEM;
 728
 729	/* create a dst for routing packets out through a VRF device */
 730	rth = rt_dst_alloc(dev, 0, RTN_UNICAST, 1, 1, 0);
 731	if (!rth)
 732		return -ENOMEM;
 733
 734	rth->dst.output	= vrf_output;
 735
 736	rcu_assign_pointer(vrf->rth, rth);
 737
 738	return 0;
 739}
 740
 741/**************************** device handling ********************/
 742
 743/* cycle interface to flush neighbor cache and move routes across tables */
 744static void cycle_netdev(struct net_device *dev,
 745			 struct netlink_ext_ack *extack)
 746{
 747	unsigned int flags = dev->flags;
 748	int ret;
 749
 750	if (!netif_running(dev))
 751		return;
 752
 753	ret = dev_change_flags(dev, flags & ~IFF_UP, extack);
 754	if (ret >= 0)
 755		ret = dev_change_flags(dev, flags, extack);
 756
 757	if (ret < 0) {
 758		netdev_err(dev,
 759			   "Failed to cycle device %s; route tables might be wrong!\n",
 760			   dev->name);
 761	}
 762}
 763
 764static int do_vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
 765			    struct netlink_ext_ack *extack)
 766{
 767	int ret;
 768
 769	/* do not allow loopback device to be enslaved to a VRF.
 770	 * The vrf device acts as the loopback for the vrf.
 771	 */
 772	if (port_dev == dev_net(dev)->loopback_dev) {
 773		NL_SET_ERR_MSG(extack,
 774			       "Can not enslave loopback device to a VRF");
 775		return -EOPNOTSUPP;
 776	}
 777
 778	port_dev->priv_flags |= IFF_L3MDEV_SLAVE;
 779	ret = netdev_master_upper_dev_link(port_dev, dev, NULL, NULL, extack);
 780	if (ret < 0)
 781		goto err;
 782
 783	cycle_netdev(port_dev, extack);
 784
 785	return 0;
 786
 787err:
 788	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;
 789	return ret;
 790}
 791
 792static int vrf_add_slave(struct net_device *dev, struct net_device *port_dev,
 793			 struct netlink_ext_ack *extack)
 794{
 795	if (netif_is_l3_master(port_dev)) {
 796		NL_SET_ERR_MSG(extack,
 797			       "Can not enslave an L3 master device to a VRF");
 798		return -EINVAL;
 799	}
 800
 801	if (netif_is_l3_slave(port_dev))
 802		return -EINVAL;
 803
 804	return do_vrf_add_slave(dev, port_dev, extack);
 805}
 806
/* inverse of do_vrf_add_slave: unlink @port_dev from the VRF device @dev,
 * clear its L3 slave flag and cycle it. Always returns 0.
 */
static int do_vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	netdev_upper_dev_unlink(port_dev, dev);
	port_dev->priv_flags &= ~IFF_L3MDEV_SLAVE;

	/* no extack is available on this path */
	cycle_netdev(port_dev, NULL);

	return 0;
}
 817
/* ndo_del_slave handler: thin wrapper around do_vrf_del_slave(). */
static int vrf_del_slave(struct net_device *dev, struct net_device *port_dev)
{
	return do_vrf_del_slave(dev, port_dev);
}
 822
/* ndo_uninit handler: undo vrf_dev_init() by releasing the IPv4 and IPv6
 * dsts and freeing the per-CPU counters.
 */
static void vrf_dev_uninit(struct net_device *dev)
{
	struct net_vrf *vrf = netdev_priv(dev);

	vrf_rtable_release(dev, vrf);
	vrf_rt6_release(dev, vrf);

	free_percpu(dev->dstats);
	/* clear the pointer so the freed counters can not be reached again */
	dev->dstats = NULL;
}
 833
 834static int vrf_dev_init(struct net_device *dev)
 835{
 836	struct net_vrf *vrf = netdev_priv(dev);
 837
 838	dev->dstats = netdev_alloc_pcpu_stats(struct pcpu_dstats);
 839	if (!dev->dstats)
 840		goto out_nomem;
 841
 842	/* create the default dst which points back to us */
 843	if (vrf_rtable_create(dev) != 0)
 844		goto out_stats;
 845
 846	if (vrf_rt6_create(dev) != 0)
 847		goto out_rth;
 848
 849	dev->flags = IFF_MASTER | IFF_NOARP;
 850
 851	/* MTU is irrelevant for VRF device; set to 64k similar to lo */
 852	dev->mtu = 64 * 1024;
 853
 854	/* similarly, oper state is irrelevant; set to up to avoid confusion */
 855	dev->operstate = IF_OPER_UP;
 856	netdev_lockdep_set_classes(dev);
 857	return 0;
 858
 859out_rth:
 860	vrf_rtable_release(dev, vrf);
 861out_stats:
 862	free_percpu(dev->dstats);
 863	dev->dstats = NULL;
 864out_nomem:
 865	return -ENOMEM;
 866}
 867
/* net_device operations of a VRF device; installed by vrf_setup(). */
static const struct net_device_ops vrf_netdev_ops = {
	.ndo_init		= vrf_dev_init,
	.ndo_uninit		= vrf_dev_uninit,
	.ndo_start_xmit		= vrf_xmit,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_get_stats64	= vrf_get_stats64,
	.ndo_add_slave		= vrf_add_slave,
	.ndo_del_slave		= vrf_del_slave,
};
 877
 878static u32 vrf_fib_table(const struct net_device *dev)
 879{
 880	struct net_vrf *vrf = netdev_priv(dev);
 881
 882	return vrf->tb_id;
 883}
 884
/* okfn passed to nf_hook() by vrf_rcv_nfhook(): frees the skb and
 * returns 0.
 */
static int vrf_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}
 890
 891static struct sk_buff *vrf_rcv_nfhook(u8 pf, unsigned int hook,
 892				      struct sk_buff *skb,
 893				      struct net_device *dev)
 894{
 895	struct net *net = dev_net(dev);
 896
 897	if (nf_hook(pf, hook, net, NULL, skb, dev, NULL, vrf_rcv_finish) != 1)
 898		skb = NULL;    /* kfree_skb(skb) handled by nf code */
 899
 900	return skb;
 901}
 902
 903#if IS_ENABLED(CONFIG_IPV6)
 904/* neighbor handling is done with actual device; do not want
 905 * to flip skb->dev for those ndisc packets. This really fails
 906 * for multiple next protocols (e.g., NEXTHDR_HOP). But it is
 907 * a start.
 908 */
 909static bool ipv6_ndisc_frame(const struct sk_buff *skb)
 910{
 911	const struct ipv6hdr *iph = ipv6_hdr(skb);
 912	bool rc = false;
 913
 914	if (iph->nexthdr == NEXTHDR_ICMP) {
 915		const struct icmp6hdr *icmph;
 916		struct icmp6hdr _icmph;
 917
 918		icmph = skb_header_pointer(skb, sizeof(*iph),
 919					   sizeof(_icmph), &_icmph);
 920		if (!icmph)
 921			goto out;
 922
 923		switch (icmph->icmp6_type) {
 924		case NDISC_ROUTER_SOLICITATION:
 925		case NDISC_ROUTER_ADVERTISEMENT:
 926		case NDISC_NEIGHBOUR_SOLICITATION:
 927		case NDISC_NEIGHBOUR_ADVERTISEMENT:
 928		case NDISC_REDIRECT:
 929			rc = true;
 930			break;
 931		}
 932	}
 933
 934out:
 935	return rc;
 936}
 937
 938static struct rt6_info *vrf_ip6_route_lookup(struct net *net,
 939					     const struct net_device *dev,
 940					     struct flowi6 *fl6,
 941					     int ifindex,
 942					     const struct sk_buff *skb,
 943					     int flags)
 944{
 945	struct net_vrf *vrf = netdev_priv(dev);
 946
 947	return ip6_pol_route(net, vrf->fib6_table, ifindex, fl6, skb, flags);
 948}
 949
 950static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev,
 951			      int ifindex)
 952{
 953	const struct ipv6hdr *iph = ipv6_hdr(skb);
 954	struct flowi6 fl6 = {
 955		.flowi6_iif     = ifindex,
 956		.flowi6_mark    = skb->mark,
 957		.flowi6_proto   = iph->nexthdr,
 958		.daddr          = iph->daddr,
 959		.saddr          = iph->saddr,
 960		.flowlabel      = ip6_flowinfo(iph),
 961	};
 962	struct net *net = dev_net(vrf_dev);
 963	struct rt6_info *rt6;
 964
 965	rt6 = vrf_ip6_route_lookup(net, vrf_dev, &fl6, ifindex, skb,
 966				   RT6_LOOKUP_F_HAS_SADDR | RT6_LOOKUP_F_IFACE);
 967	if (unlikely(!rt6))
 968		return;
 969
 970	if (unlikely(&rt6->dst == &net->ipv6.ip6_null_entry->dst))
 971		return;
 972
 973	skb_dst_set(skb, &rt6->dst);
 974}
 975
/* IPv6 part of the l3_rcv hook for packets received on behalf of @vrf_dev.
 *
 * Returns the skb to continue processing, or NULL when the PRE_ROUTING
 * netfilter hook did not hand it back.
 */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	/* saved because skb_iif may be rewritten to the VRF ifindex below */
	int orig_iif = skb->skb_iif;
	bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr);
	bool is_ndisc = ipv6_ndisc_frame(skb);

	/* loopback, multicast & non-ND link-local traffic; do not push through
	 * packet taps again. Reset pkt_type for upper layers to process skb
	 */
	if (skb->pkt_type == PACKET_LOOPBACK || (need_strict && !is_ndisc)) {
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;
		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
		if (skb->pkt_type == PACKET_LOOPBACK)
			skb->pkt_type = PACKET_HOST;
		goto out;
	}

	/* if packet is NDISC then keep the ingress interface */
	if (!is_ndisc) {
		vrf_rx_stats(vrf_dev, skb->len);
		skb->dev = vrf_dev;
		skb->skb_iif = vrf_dev->ifindex;

		/* show the frame, MAC header included, to taps on the VRF */
		if (!list_empty(&vrf_dev->ptype_all)) {
			skb_push(skb, skb->mac_len);
			dev_queue_xmit_nit(skb, vrf_dev);
			skb_pull(skb, skb->mac_len);
		}

		IP6CB(skb)->flags |= IP6SKB_L3SLAVE;
	}

	/* only reachable for ND frames here (see the early exit above);
	 * the lookup uses the original ingress ifindex
	 */
	if (need_strict)
		vrf_ip6_input_dst(skb, vrf_dev, orig_iif);

	skb = vrf_rcv_nfhook(NFPROTO_IPV6, NF_INET_PRE_ROUTING, skb, vrf_dev);
out:
	return skb;
}
1017
1018#else
/* CONFIG_IPV6 disabled: nothing to do, hand the skb back unchanged. */
static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev,
				   struct sk_buff *skb)
{
	return skb;
}
1024#endif
1025
1026static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev,
1027				  struct sk_buff *skb)
1028{
1029	skb->dev = vrf_dev;
1030	skb->skb_iif = vrf_dev->ifindex;
1031	IPCB(skb)->flags |= IPSKB_L3SLAVE;
1032
1033	if (ipv4_is_multicast(ip_hdr(skb)->daddr))
1034		goto out;
1035
1036	/* loopback traffic; do not push through packet taps again.
1037	 * Reset pkt_type for upper layers to process skb
1038	 */
1039	if (skb->pkt_type == PACKET_LOOPBACK) {
1040		skb->pkt_type = PACKET_HOST;
1041		goto out;
1042	}
1043
1044	vrf_rx_stats(vrf_dev, skb->len);
1045
1046	if (!list_empty(&vrf_dev->ptype_all)) {
1047		skb_push(skb, skb->mac_len);
1048		dev_queue_xmit_nit(skb, vrf_dev);
1049		skb_pull(skb, skb->mac_len);
1050	}
1051
1052	skb = vrf_rcv_nfhook(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, vrf_dev);
1053out:
1054	return skb;
1055}
1056
1057/* called with rcu lock held */
1058static struct sk_buff *vrf_l3_rcv(struct net_device *vrf_dev,
1059				  struct sk_buff *skb,
1060				  u16 proto)
1061{
1062	switch (proto) {
1063	case AF_INET:
1064		return vrf_ip_rcv(vrf_dev, skb);
1065	case AF_INET6:
1066		return vrf_ip6_rcv(vrf_dev, skb);
1067	}
1068
1069	return skb;
1070}
1071
1072#if IS_ENABLED(CONFIG_IPV6)
1073/* send to link-local or multicast address via interface enslaved to
1074 * VRF device. Force lookup to VRF table without changing flow struct
1075 */
1076static struct dst_entry *vrf_link_scope_lookup(const struct net_device *dev,
1077					      struct flowi6 *fl6)
1078{
1079	struct net *net = dev_net(dev);
1080	int flags = RT6_LOOKUP_F_IFACE;
1081	struct dst_entry *dst = NULL;
1082	struct rt6_info *rt;
1083
1084	/* VRF device does not have a link-local address and
1085	 * sending packets to link-local or mcast addresses over
1086	 * a VRF device does not make sense
1087	 */
1088	if (fl6->flowi6_oif == dev->ifindex) {
1089		dst = &net->ipv6.ip6_null_entry->dst;
1090		dst_hold(dst);
1091		return dst;
1092	}
1093
1094	if (!ipv6_addr_any(&fl6->saddr))
1095		flags |= RT6_LOOKUP_F_HAS_SADDR;
1096
1097	rt = vrf_ip6_route_lookup(net, dev, fl6, fl6->flowi6_oif, NULL, flags);
1098	if (rt)
1099		dst = &rt->dst;
1100
1101	return dst;
1102}
1103#endif
1104
/* L3 master device operations of a VRF device; installed by vrf_setup().
 * The link scope lookup only exists when IPv6 is built in.
 */
static const struct l3mdev_ops vrf_l3mdev_ops = {
	.l3mdev_fib_table	= vrf_fib_table,
	.l3mdev_l3_rcv		= vrf_l3_rcv,
	.l3mdev_l3_out		= vrf_l3_out,
#if IS_ENABLED(CONFIG_IPV6)
	.l3mdev_link_scope_lookup = vrf_link_scope_lookup,
#endif
};
1113
1114static void vrf_get_drvinfo(struct net_device *dev,
1115			    struct ethtool_drvinfo *info)
1116{
1117	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
1118	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
1119}
1120
/* ethtool operations of a VRF device: only driver info is provided. */
static const struct ethtool_ops vrf_ethtool_ops = {
	.get_drvinfo	= vrf_get_drvinfo,
};
1124
1125static inline size_t vrf_fib_rule_nl_size(void)
1126{
1127	size_t sz;
1128
1129	sz  = NLMSG_ALIGN(sizeof(struct fib_rule_hdr));
1130	sz += nla_total_size(sizeof(u8));	/* FRA_L3MDEV */
1131	sz += nla_total_size(sizeof(u32));	/* FRA_PRIORITY */
1132	sz += nla_total_size(sizeof(u8));       /* FRA_PROTOCOL */
1133
1134	return sz;
1135}
1136
1137static int vrf_fib_rule(const struct net_device *dev, __u8 family, bool add_it)
1138{
1139	struct fib_rule_hdr *frh;
1140	struct nlmsghdr *nlh;
1141	struct sk_buff *skb;
1142	int err;
1143
1144	if (family == AF_INET6 && !ipv6_mod_enabled())
1145		return 0;
1146
1147	skb = nlmsg_new(vrf_fib_rule_nl_size(), GFP_KERNEL);
1148	if (!skb)
1149		return -ENOMEM;
1150
1151	nlh = nlmsg_put(skb, 0, 0, 0, sizeof(*frh), 0);
1152	if (!nlh)
1153		goto nla_put_failure;
1154
1155	/* rule only needs to appear once */
1156	nlh->nlmsg_flags |= NLM_F_EXCL;
1157
1158	frh = nlmsg_data(nlh);
1159	memset(frh, 0, sizeof(*frh));
1160	frh->family = family;
1161	frh->action = FR_ACT_TO_TBL;
1162
1163	if (nla_put_u8(skb, FRA_PROTOCOL, RTPROT_KERNEL))
1164		goto nla_put_failure;
1165
1166	if (nla_put_u8(skb, FRA_L3MDEV, 1))
1167		goto nla_put_failure;
1168
1169	if (nla_put_u32(skb, FRA_PRIORITY, FIB_RULE_PREF))
1170		goto nla_put_failure;
1171
1172	nlmsg_end(skb, nlh);
1173
1174	/* fib_nl_{new,del}rule handling looks for net from skb->sk */
1175	skb->sk = dev_net(dev)->rtnl;
1176	if (add_it) {
1177		err = fib_nl_newrule(skb, nlh, NULL);
1178		if (err == -EEXIST)
1179			err = 0;
1180	} else {
1181		err = fib_nl_delrule(skb, nlh, NULL);
1182		if (err == -ENOENT)
1183			err = 0;
1184	}
1185	nlmsg_free(skb);
1186
1187	return err;
1188
1189nla_put_failure:
1190	nlmsg_free(skb);
1191
1192	return -EMSGSIZE;
1193}
1194
1195static int vrf_add_fib_rules(const struct net_device *dev)
1196{
1197	int err;
1198
1199	err = vrf_fib_rule(dev, AF_INET,  true);
1200	if (err < 0)
1201		goto out_err;
1202
1203	err = vrf_fib_rule(dev, AF_INET6, true);
1204	if (err < 0)
1205		goto ipv6_err;
1206
1207#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
1208	err = vrf_fib_rule(dev, RTNL_FAMILY_IPMR, true);
1209	if (err < 0)
1210		goto ipmr_err;
1211#endif
1212
1213#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1214	err = vrf_fib_rule(dev, RTNL_FAMILY_IP6MR, true);
1215	if (err < 0)
1216		goto ip6mr_err;
1217#endif
1218
1219	return 0;
1220
1221#if IS_ENABLED(CONFIG_IPV6_MROUTE_MULTIPLE_TABLES)
1222ip6mr_err:
1223	vrf_fib_rule(dev, RTNL_FAMILY_IPMR,  false);
1224#endif
1225
1226#if IS_ENABLED(CONFIG_IP_MROUTE_MULTIPLE_TABLES)
1227ipmr_err:
1228	vrf_fib_rule(dev, AF_INET6,  false);
1229#endif
1230
1231ipv6_err:
1232	vrf_fib_rule(dev, AF_INET,  false);
1233
1234out_err:
1235	netdev_err(dev, "Failed to add FIB rules.\n");
1236	return err;
1237}
1238
1239static void vrf_setup(struct net_device *dev)
1240{
1241	ether_setup(dev);
1242
1243	/* Initialize the device structure. */
1244	dev->netdev_ops = &vrf_netdev_ops;
1245	dev->l3mdev_ops = &vrf_l3mdev_ops;
1246	dev->ethtool_ops = &vrf_ethtool_ops;
1247	dev->needs_free_netdev = true;
1248
1249	/* Fill in device structure with ethernet-generic values. */
1250	eth_hw_addr_random(dev);
1251
1252	/* don't acquire vrf device's netif_tx_lock when transmitting */
1253	dev->features |= NETIF_F_LLTX;
1254
1255	/* don't allow vrf devices to change network namespaces. */
1256	dev->features |= NETIF_F_NETNS_LOCAL;
1257
1258	/* does not make sense for a VLAN to be added to a vrf device */
1259	dev->features   |= NETIF_F_VLAN_CHALLENGED;
1260
1261	/* enable offload features */
1262	dev->features   |= NETIF_F_GSO_SOFTWARE;
1263	dev->features   |= NETIF_F_RXCSUM | NETIF_F_HW_CSUM | NETIF_F_SCTP_CRC;
1264	dev->features   |= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA;
1265
1266	dev->hw_features = dev->features;
1267	dev->hw_enc_features = dev->features;
1268
1269	/* default to no qdisc; user can add if desired */
1270	dev->priv_flags |= IFF_NO_QUEUE;
1271	dev->priv_flags |= IFF_NO_RX_HANDLER;
1272	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
1273
1274	/* VRF devices do not care about MTU, but if the MTU is set
1275	 * too low then the ipv4 and ipv6 protocols are disabled
1276	 * which breaks networking.
1277	 */
1278	dev->min_mtu = IPV6_MIN_MTU;
1279	dev->max_mtu = ETH_MAX_MTU;
1280}
1281
1282static int vrf_validate(struct nlattr *tb[], struct nlattr *data[],
1283			struct netlink_ext_ack *extack)
1284{
1285	if (tb[IFLA_ADDRESS]) {
1286		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
1287			NL_SET_ERR_MSG(extack, "Invalid hardware address");
1288			return -EINVAL;
1289		}
1290		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
1291			NL_SET_ERR_MSG(extack, "Invalid hardware address");
1292			return -EADDRNOTAVAIL;
1293		}
1294	}
1295	return 0;
1296}
1297
1298static void vrf_dellink(struct net_device *dev, struct list_head *head)
1299{
1300	struct net_device *port_dev;
1301	struct list_head *iter;
1302
1303	netdev_for_each_lower_dev(dev, port_dev, iter)
1304		vrf_del_slave(dev, port_dev);
1305
1306	unregister_netdevice_queue(dev, head);
1307}
1308
1309static int vrf_newlink(struct net *src_net, struct net_device *dev,
1310		       struct nlattr *tb[], struct nlattr *data[],
1311		       struct netlink_ext_ack *extack)
1312{
1313	struct net_vrf *vrf = netdev_priv(dev);
1314	bool *add_fib_rules;
1315	struct net *net;
1316	int err;
1317
1318	if (!data || !data[IFLA_VRF_TABLE]) {
1319		NL_SET_ERR_MSG(extack, "VRF table id is missing");
1320		return -EINVAL;
1321	}
1322
1323	vrf->tb_id = nla_get_u32(data[IFLA_VRF_TABLE]);
1324	if (vrf->tb_id == RT_TABLE_UNSPEC) {
1325		NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VRF_TABLE],
1326				    "Invalid VRF table id");
1327		return -EINVAL;
1328	}
1329
1330	dev->priv_flags |= IFF_L3MDEV_MASTER;
1331
1332	err = register_netdevice(dev);
1333	if (err)
1334		goto out;
1335
1336	net = dev_net(dev);
1337	add_fib_rules = net_generic(net, vrf_net_id);
1338	if (*add_fib_rules) {
1339		err = vrf_add_fib_rules(dev);
1340		if (err) {
1341			unregister_netdevice(dev);
1342			goto out;
1343		}
1344		*add_fib_rules = false;
1345	}
1346
1347out:
1348	return err;
1349}
1350
1351static size_t vrf_nl_getsize(const struct net_device *dev)
1352{
1353	return nla_total_size(sizeof(u32));  /* IFLA_VRF_TABLE */
1354}
1355
1356static int vrf_fillinfo(struct sk_buff *skb,
1357			const struct net_device *dev)
1358{
1359	struct net_vrf *vrf = netdev_priv(dev);
1360
1361	return nla_put_u32(skb, IFLA_VRF_TABLE, vrf->tb_id);
1362}
1363
1364static size_t vrf_get_slave_size(const struct net_device *bond_dev,
1365				 const struct net_device *slave_dev)
1366{
1367	return nla_total_size(sizeof(u32));  /* IFLA_VRF_PORT_TABLE */
1368}
1369
1370static int vrf_fill_slave_info(struct sk_buff *skb,
1371			       const struct net_device *vrf_dev,
1372			       const struct net_device *slave_dev)
1373{
1374	struct net_vrf *vrf = netdev_priv(vrf_dev);
1375
1376	if (nla_put_u32(skb, IFLA_VRF_PORT_TABLE, vrf->tb_id))
1377		return -EMSGSIZE;
1378
1379	return 0;
1380}
1381
1382static const struct nla_policy vrf_nl_policy[IFLA_VRF_MAX + 1] = {
1383	[IFLA_VRF_TABLE] = { .type = NLA_U32 },
1384};
1385
1386static struct rtnl_link_ops vrf_link_ops __read_mostly = {
1387	.kind		= DRV_NAME,
1388	.priv_size	= sizeof(struct net_vrf),
1389
1390	.get_size	= vrf_nl_getsize,
1391	.policy		= vrf_nl_policy,
1392	.validate	= vrf_validate,
1393	.fill_info	= vrf_fillinfo,
1394
1395	.get_slave_size  = vrf_get_slave_size,
1396	.fill_slave_info = vrf_fill_slave_info,
1397
1398	.newlink	= vrf_newlink,
1399	.dellink	= vrf_dellink,
1400	.setup		= vrf_setup,
1401	.maxtype	= IFLA_VRF_MAX,
1402};
1403
1404static int vrf_device_event(struct notifier_block *unused,
1405			    unsigned long event, void *ptr)
1406{
1407	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1408
1409	/* only care about unregister events to drop slave references */
1410	if (event == NETDEV_UNREGISTER) {
1411		struct net_device *vrf_dev;
1412
1413		if (!netif_is_l3_slave(dev))
1414			goto out;
1415
1416		vrf_dev = netdev_master_upper_dev_get(dev);
1417		vrf_del_slave(vrf_dev, dev);
1418	}
1419out:
1420	return NOTIFY_DONE;
1421}
1422
1423static struct notifier_block vrf_notifier_block __read_mostly = {
1424	.notifier_call = vrf_device_event,
1425};
1426
1427/* Initialize per network namespace state */
1428static int __net_init vrf_netns_init(struct net *net)
1429{
1430	bool *add_fib_rules = net_generic(net, vrf_net_id);
1431
1432	*add_fib_rules = true;
1433
1434	return 0;
1435}
1436
1437static struct pernet_operations vrf_net_ops __net_initdata = {
1438	.init = vrf_netns_init,
1439	.id   = &vrf_net_id,
1440	.size = sizeof(bool),
1441};
1442
1443static int __init vrf_init_module(void)
1444{
1445	int rc;
1446
1447	register_netdevice_notifier(&vrf_notifier_block);
1448
1449	rc = register_pernet_subsys(&vrf_net_ops);
1450	if (rc < 0)
1451		goto error;
1452
1453	rc = rtnl_link_register(&vrf_link_ops);
1454	if (rc < 0) {
1455		unregister_pernet_subsys(&vrf_net_ops);
1456		goto error;
1457	}
1458
1459	return 0;
1460
1461error:
1462	unregister_netdevice_notifier(&vrf_notifier_block);
1463	return rc;
1464}
1465
1466module_init(vrf_init_module);
1467MODULE_AUTHOR("Shrijeet Mukherjee, David Ahern");
1468MODULE_DESCRIPTION("Device driver to instantiate VRF domains");
1469MODULE_LICENSE("GPL");
1470MODULE_ALIAS_RTNL_LINK(DRV_NAME);
1471MODULE_VERSION(DRV_VERSION);