// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2013 Nicira, Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/udp.h>
#include <net/dst_metadata.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

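/* Hash a tunnel on its i_key and remote (outer destination) address so
 * that receive-side lookups go straight to the right bucket.
 */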
static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

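/* Check whether the flags/key parsed from a received packet agree with a
 * tunnel's configuration: a keyed tunnel matches only the same key, and a
 * keyless tunnel matches only packets that carry no key.
 */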
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options
 *
 * Tunnel hash table:
 * We require an exact key match, i.e. if a key is present in the packet
 * it will match only a tunnel with the same key; if no key is present,
 * it will match only a keyless tunnel.
 *
 * All keyless packets that do not match a configured keyless tunnel
 * will match the fallback tunnel.
 * Given src, dst and key, find the appropriate tunnel for the input packet.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;
        struct net_device *ndev;
        unsigned int hash;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    t->parms.iph.saddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr || t->parms.iph.daddr != 0) &&
                    (local != t->parms.iph.daddr || !ipv4_is_multicast(local)))
                        continue;

                if (!(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((!(flags & TUNNEL_NO_KEY) && t->parms.i_key != key) ||
                    t->parms.iph.saddr != 0 ||
                    t->parms.iph.daddr != 0 ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (cand)
                return cand;

        t = rcu_dereference(itn->collect_md_tun);
        if (t && t->dev->flags & IFF_UP)
                return t;

        ndev = READ_ONCE(itn->fb_tunnel_dev);
        if (ndev && ndev->flags & IFF_UP)
                return netdev_priv(ndev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

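/* Map a tunnel's configuration to its hash bucket.  Multicast and wildcard
 * destinations hash as remote == 0, and a VTI tunnel's i_key is ignored
 * unless TUNNEL_KEY is set, mirroring how lookup treats these cases.
 */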
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;
        __be32 i_key = parms->i_key;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        if (!(parms->i_flags & TUNNEL_KEY) && (parms->i_flags & VTI_ISVTI))
                i_key = 0;

        h = ip_tunnel_hash(i_key, remote);
        return &itn->tunnels[h];
}

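/* Insert or remove a tunnel in the per-netns hash table.  Writers hold
 * RTNL; readers walk the table under RCU, hence the _rcu list primitives.
 */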
static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, t);
        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        if (t->collect_md)
                rcu_assign_pointer(itn->collect_md_tun, NULL);
        hlist_del_init_rcu(&t->hash_node);
}

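/* Find an existing tunnel that exactly matches the given parameters and
 * device type, or NULL; used to detect duplicates before create/change.
 */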
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        __be16 flags = parms->i_flags;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    link == t->parms.link &&
                    type == t->dev->type &&
                    ip_tunnel_key_match(&t->parms, flags, key))
                        break;
        }
        return t;
}

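/* Allocate and register a tunnel netdevice.  The name either comes from
 * userspace (validated first) or is generated from the rtnl_link_ops kind
 * plus a "%d" template that register_netdevice() expands.
 */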
static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        err = -E2BIG;
        if (parms->name[0]) {
                if (!dev_valid_name(parms->name))
                        goto failed;
                strlcpy(name, parms->name, IFNAMSIZ);
        } else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3))
                        goto failed;
                strcpy(name, ops->kind);
                strcat(name, "%d");
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, NET_NAME_UNKNOWN, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

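/* Resolve the underlying output device for a fixed-destination tunnel and
 * derive a usable MTU and needed_headroom from it; returns the tunnel MTU.
 */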
static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr,
                                    iph->saddr, tunnel->parms.o_key,
                                    RT_TOS(iph->tos), tunnel->parms.link,
                                    tunnel->fwmark, 0);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;

                dst_cache_reset(&tunnel->dst_cache);
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = min(tdev->mtu, IP_MAX_MTU);
        }

        dev->needed_headroom = t_hlen + hlen;
        mtu -= t_hlen;

        if (mtu < IPV4_MIN_MTU)
                mtu = IPV4_MIN_MTU;

        return mtu;
}

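/* Create a tunnel device from userspace-supplied parameters, size its MTU
 * against the encapsulation overhead, and hash it into the per-netns table.
 */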
static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt;
        struct net_device *dev;
        int t_hlen;
        int mtu;
        int err;

        dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return ERR_CAST(dev);

        mtu = ip_tunnel_bind_dev(dev);
        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        nt = netdev_priv(dev);
        t_hlen = nt->hlen + sizeof(struct iphdr);
        dev->min_mtu = ETH_MIN_MTU;
        dev->max_mtu = IP_MAX_MTU - t_hlen;
        ip_tunnel_add(itn, nt);
        return nt;

err_dev_set_mtu:
        unregister_netdevice(dev);
        return ERR_PTR(err);
}

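/* Common receive path: validate checksum/sequence flags against the tunnel
 * configuration, decapsulate ECN, update stats, and hand the packet to GRO.
 * Consumes the skb (and tun_dst) in all cases and always returns 0.
 */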
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst,
                  bool log_ecn_error)
{
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) && (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
            ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        skb_reset_network_header(skb);

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                             &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        dev_sw_netstats_rx_add(tunnel->dev, skb->len);
        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        if (tun_dst)
                skb_dst_set(skb, (struct dst_entry *)tun_dst);

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        if (tun_dst)
                dst_release((struct dst_entry *)tun_dst);
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

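/* Register/unregister an encapsulation handler (e.g. FOU/GUE) in a fixed
 * slot of iptun_encaps[].  cmpxchg() keeps registration race-free, and the
 * del path waits out in-flight readers with synchronize_net().
 */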
int ip_tunnel_encap_add_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        return !cmpxchg((const struct ip_tunnel_encap_ops **)
                        &iptun_encaps[num],
                        NULL, ops) ? 0 : -1;
}
EXPORT_SYMBOL(ip_tunnel_encap_add_ops);

int ip_tunnel_encap_del_ops(const struct ip_tunnel_encap_ops *ops,
                            unsigned int num)
{
        int ret;

        if (num >= MAX_IPTUN_ENCAP_OPS)
                return -ERANGE;

        ret = (cmpxchg((const struct ip_tunnel_encap_ops **)
                       &iptun_encaps[num],
                       ops, NULL) == ops) ? 0 : -1;

        synchronize_net();

        return ret;
}
EXPORT_SYMBOL(ip_tunnel_encap_del_ops);

int ip_tunnel_encap_setup(struct ip_tunnel *t,
                          struct ip_tunnel_encap *ipencap)
{
        int hlen;

        memset(&t->encap, 0, sizeof(t->encap));

        hlen = ip_encap_hlen(ipencap);
        if (hlen < 0)
                return hlen;

        t->encap.type = ipencap->type;
        t->encap.sport = ipencap->sport;
        t->encap.dport = ipencap->dport;
        t->encap.flags = ipencap->flags;

        t->encap_hlen = hlen;
        t->hlen = t->encap_hlen + t->tun_hlen;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_encap_setup);

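/* Path-MTU handling on transmit: compute the usable inner MTU for this
 * route, propagate it to the skb's dst, and emit ICMP(v6) "too big" errors
 * when a non-GSO packet exceeds it.  Returns 0 or -E2BIG.
 */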
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                           struct rtable *rt, __be16 df,
                           const struct iphdr *inner_iph,
                           int tunnel_hlen, __be32 dst, bool md)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size;
        int mtu;

        tunnel_hlen = md ? tunnel_hlen : tunnel->hlen;
        pkt_size = skb->len - tunnel_hlen;

        if (df)
                mtu = dst_mtu(&rt->dst) - (sizeof(struct iphdr) + tunnel_hlen);
        else
                mtu = skb_valid_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_valid_dst(skb))
                skb_dst_update_pmtu_no_confirm(skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (inner_iph->frag_off & htons(IP_DF)) &&
                    mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6;
                __be32 daddr;

                rt6 = skb_valid_dst(skb) ? (struct rt6_info *)skb_dst(skb) :
                                           NULL;
                daddr = md ? dst : tunnel->parms.iph.daddr;

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                    mtu >= IPV6_MIN_MTU) {
                        if ((daddr && !ipv4_is_multicast(daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                    mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

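/* Transmit path for metadata-based (collect_md) tunnels: the per-packet
 * key, addresses, TOS and TTL all come from the skb's attached tunnel
 * metadata rather than from the device configuration.
 */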
void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                       u8 proto, int tunnel_hlen)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        u32 headroom = sizeof(struct iphdr);
        struct ip_tunnel_info *tun_info;
        const struct ip_tunnel_key *key;
        const struct iphdr *inner_iph;
        struct rtable *rt = NULL;
        struct flowi4 fl4;
        __be16 df = 0;
        u8 tos, ttl;
        bool use_cache;

        tun_info = skb_tunnel_info(skb);
        if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
                     ip_tunnel_info_af(tun_info) != AF_INET))
                goto tx_error;
        key = &tun_info->key;
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        tos = key->tos;
        if (tos == 1) {
                if (skb->protocol == htons(ETH_P_IP))
                        tos = inner_iph->tos;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
        }
        ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src,
                            tunnel_id_to_key32(key->tun_id), RT_TOS(tos),
                            0, skb->mark, skb_get_hash(skb));
        if (tunnel->encap.type != TUNNEL_ENCAP_NONE)
                goto tx_error;

        use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
        if (use_cache)
                rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl4.saddr);
        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);
                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
        }
        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (key->tun_flags & TUNNEL_DONT_FRAGMENT)
                df = htons(IP_DF);
        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, tunnel_hlen,
                            key->u.ipv4.dst, true)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = key->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
        if (headroom > dev->needed_headroom)
                dev->needed_headroom = headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                goto tx_dropped;
        }
        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;
tx_error:
        dev->stats.tx_errors++;
        goto kfree;
tx_dropped:
        dev->stats.tx_dropped++;
kfree:
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_md_tunnel_xmit);

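/* Transmit path for classically configured tunnels.  For an NBMA tunnel
 * (daddr == 0) the outer destination is recovered per packet from tunnel
 * metadata, the inner IPv4 route, or an IPv4-compatible IPv6 neighbour.
 */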
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_info *tun_info = NULL;
        const struct iphdr *inner_iph;
        unsigned int max_headroom;      /* The extra header space needed */
        struct rtable *rt = NULL;       /* Route to the other host */
        bool use_cache = false;
        struct flowi4 fl4;
        bool md = false;
        bool connected;
        u8 tos, ttl;
        __be32 dst;
        __be16 df;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
        connected = (tunnel->parms.iph.daddr != 0);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (!skb_dst(skb)) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                tun_info = skb_tunnel_info(skb);
                if (tun_info && (tun_info->mode & IP_TUNNEL_INFO_TX) &&
                    ip_tunnel_info_af(tun_info) == AF_INET &&
                    tun_info->key.u.ipv4.dst) {
                        dst = tun_info->key.u.ipv4.dst;
                        md = true;
                        connected = true;
                }
                else if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (!neigh)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                if (!md)
                        connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr,
                            tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link,
                            tunnel->fwmark, skb_get_hash(skb));

        if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
                goto tx_error;

        if (connected && md) {
                use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
                if (use_cache)
                        rt = dst_cache_get_ip4(&tun_info->dst_cache,
                                               &fl4.saddr);
        } else {
                rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache,
                                                   &fl4.saddr) : NULL;
        }

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (use_cache)
                        dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
                                          fl4.saddr);
                else if (!md && connected)
                        dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
                                          fl4.saddr);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP) && !tunnel->ignore_df)
                df |= (inner_iph->frag_off & htons(IP_DF));

        if (tnl_update_pmtu(dev, skb, rt, df, inner_iph, 0, 0, false)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                ip_rt_put(rt);
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
                      df, !net_eq(tunnel->net, dev_net(dev)));
        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

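/* Apply changed parameters to an existing tunnel: re-hash it (the bucket
 * depends on daddr/i_key), refresh link-layer addresses, and re-bind the
 * underlying device when the link or fwmark changed.
 */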
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu,
                             __u32 fwmark)
{
        ip_tunnel_del(itn, t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link || t->fwmark != fwmark) {
                int mtu;

                t->parms.link = p->link;
                t->fwmark = fwmark;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        dst_cache_reset(&t->dst_cache);
        netdev_state_change(dev);
}

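/* Back end for the SIOC{GET,ADD,CHG,DEL}TUNNEL ioctls.  Add/change/delete
 * require CAP_NET_ADMIN in the tunnel's user namespace; operations on the
 * fallback device are redirected to the tunnel matching the parameters.
 */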
int ip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t = netdev_priv(dev);
        struct net *net = t->net;
        struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                if (dev == itn->fb_tunnel_dev) {
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                t = netdev_priv(dev);
                }
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags & VTI_ISVTI)) {
                        if (!(p->i_flags & TUNNEL_KEY))
                                p->i_key = 0;
                        if (!(p->o_flags & TUNNEL_KEY))
                                p->o_key = 0;
                }

                t = ip_tunnel_find(itn, p, itn->type);

                if (cmd == SIOCADDTUNNEL) {
                        if (!t) {
                                t = ip_tunnel_create(net, itn, p);
                                err = PTR_ERR_OR_ZERO(t);
                                break;
                        }

                        err = -EEXIST;
                        break;
                }
                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags ^ nflags) &
                                    (IFF_POINTOPOINT | IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true, 0);
                } else {
                        err = -ENOENT;
                }
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (!t)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ctl);

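/* Thin ioctl wrapper: copy the ip_tunnel_parm from userspace, dispatch to
 * the driver's ndo_tunnel_ctl, and copy the (possibly updated) parameters
 * back.  A driver built on this library typically wires it up like this
 * (a sketch modelled on ipip; other drivers' names differ):
 *
 *	static const struct net_device_ops ipip_netdev_ops = {
 *		...
 *		.ndo_do_ioctl	= ip_tunnel_ioctl,
 *		.ndo_tunnel_ctl	= ipip_tunnel_ctl,
 *	};
 */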
int ip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
{
        struct ip_tunnel_parm p;
        int err;

        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                return -EFAULT;
        err = dev->netdev_ops->ndo_tunnel_ctl(dev, &p, cmd);
        if (!err && copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                return -EFAULT;
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

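/* Clamp a requested MTU to what the encapsulation overhead allows.  With
 * strict == true an oversized value is rejected with -EINVAL; otherwise it
 * is silently capped at the maximum.
 */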
int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);
        int max_mtu = IP_MAX_MTU - t_hlen;

        if (new_mtu < ETH_MIN_MTU)
                return -EINVAL;

        if (new_mtu > max_mtu) {
                if (strict)
                        return -EINVAL;

                new_mtu = max_mtu;
        }

        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        return __ip_tunnel_change_mtu(dev, new_mtu, true);
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        dst_cache_destroy(&tunnel->dst_cache);
        free_percpu(dev->tstats);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(itn, netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

struct net *ip_tunnel_get_link_net(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->net;
}
EXPORT_SYMBOL(ip_tunnel_get_link_net);

int ip_tunnel_get_iflink(const struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        return tunnel->parms.link;
}
EXPORT_SYMBOL(ip_tunnel_get_iflink);

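/* Per-netns initialization: set up the hash table and, unless fallback
 * tunnels are disabled for this netns, create the fallback device that
 * catches otherwise unmatched packets.
 */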
int ip_tunnel_init_net(struct net *net, unsigned int ip_tnl_net_id,
                       struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        itn->rtnl_link_ops = ops;
        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops || !net_has_fallback_tunnels(net)) {
                struct ip_tunnel_net *it_init_net;

                it_init_net = net_generic(&init_net, ip_tnl_net_id);
                itn->type = it_init_net->type;
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to move to another netns would clearly be unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
                itn->type = itn->fb_tunnel_dev->type;
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

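/* Queue every device of this tunnel type for unregistration when a netns
 * goes away, including tunnels whose device lives in a different netns and
 * is therefore only reachable through the hash table.
 */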
static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
                              struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id,
                           struct rtnl_link_ops *ops)
{
        struct ip_tunnel_net *itn;
        struct net *net;
        LIST_HEAD(list);

        rtnl_lock();
        list_for_each_entry(net, net_list, exit_list) {
                itn = net_generic(net, id);
                ip_tunnel_destroy(net, itn, &list, ops);
        }
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets);

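/* rtnl_link_ops newlink back end: reject duplicates (only one collect_md
 * tunnel is allowed per netns), register the device, then size its MTU
 * from the bound lower device unless userspace supplied IFLA_MTU.
 */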
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (nt->collect_md) {
                if (rtnl_dereference(itn->collect_md_tun))
                        return -EEXIST;
        } else {
                if (ip_tunnel_find(itn, p, dev->type))
                        return -EEXIST;
        }

        nt->net = net;
        nt->parms = *p;
        nt->fwmark = fwmark;
        err = register_netdevice(dev);
        if (err)
                goto err_register_netdevice;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (tb[IFLA_MTU]) {
                unsigned int max = IP_MAX_MTU - (nt->hlen + sizeof(struct iphdr));

                mtu = clamp(dev->mtu, (unsigned int)ETH_MIN_MTU, max);
        }

        err = dev_set_mtu(dev, mtu);
        if (err)
                goto err_dev_set_mtu;

        ip_tunnel_add(itn, nt);
        return 0;

err_dev_set_mtu:
        unregister_netdevice(dev);
err_register_netdevice:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

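/* rtnl_link_ops changelink back end: refuse changes to the fallback device
 * and parameter sets that collide with another tunnel, then apply the new
 * parameters via ip_tunnel_update().
 */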
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p, __u32 fwmark)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

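/* ndo_init/ndo_uninit helpers shared by the drivers built on this library:
 * allocate per-cpu stats, the dst cache and GRO cells on init; unhash the
 * tunnel and drop the fallback pointer on uninit.
 */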
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int err;

        dev->needs_free_netdev = true;
        dev->priv_destructor = ip_tunnel_dev_free;
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
        if (err) {
                free_percpu(dev->tstats);
                return err;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                dst_cache_destroy(&tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version = 4;
        iph->ihl = 5;

        if (tunnel->collect_md)
                netif_keep_dst(dev);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        ip_tunnel_del(itn, netdev_priv(dev));
        if (itn->fb_tunnel_dev == dev)
                WRITE_ONCE(itn->fb_tunnel_dev, NULL);

        dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the device's
 * init callback.
 */
void ip_tunnel_setup(struct net_device *dev, unsigned int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");