net/ipv6/ip6_output.c at v4.16-rc5

tjh.dev / kernel
fork
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork
kernel / net / ipv6 / ip6_output.c
at v4.16-rc5 1767 lines 46 kB view raw
wrap content
   1/*
   2 *	IPv6 output functions
   3 *	Linux INET6 implementation
   4 *
   5 *	Authors:
   6 *	Pedro Roque		<roque@di.fc.ul.pt>
   7 *
   8 *	Based on linux/net/ipv4/ip_output.c
   9 *
  10 *	This program is free software; you can redistribute it and/or
  11 *      modify it under the terms of the GNU General Public License
  12 *      as published by the Free Software Foundation; either version
  13 *      2 of the License, or (at your option) any later version.
  14 *
  15 *	Changes:
   16 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
  17 *				extension headers are implemented.
  18 *				route changes now work.
  19 *				ip6_forward does not confuse sniffers.
  20 *				etc.
  21 *
  22 *      H. von Brand    :       Added missing #include <linux/string.h>
  23 *	Imran Patel	:	frag id should be in NBO
  24 *      Kazunori MIYAZAWA @USAGI
  25 *			:       add ip6_append_data and related functions
  26 *				for datagram xmit
  27 */
  28
  29#include <linux/errno.h>
  30#include <linux/kernel.h>
  31#include <linux/string.h>
  32#include <linux/socket.h>
  33#include <linux/net.h>
  34#include <linux/netdevice.h>
  35#include <linux/if_arp.h>
  36#include <linux/in6.h>
  37#include <linux/tcp.h>
  38#include <linux/route.h>
  39#include <linux/module.h>
  40#include <linux/slab.h>
  41
  42#include <linux/bpf-cgroup.h>
  43#include <linux/netfilter.h>
  44#include <linux/netfilter_ipv6.h>
  45
  46#include <net/sock.h>
  47#include <net/snmp.h>
  48
  49#include <net/ipv6.h>
  50#include <net/ndisc.h>
  51#include <net/protocol.h>
  52#include <net/ip6_route.h>
  53#include <net/addrconf.h>
  54#include <net/rawv6.h>
  55#include <net/icmp.h>
  56#include <net/xfrm.h>
  57#include <net/checksum.h>
  58#include <linux/mroute6.h>
  59#include <net/l3mdev.h>
  60#include <net/lwtunnel.h>
  61
  62static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
  63{
  64	struct dst_entry *dst = skb_dst(skb);
  65	struct net_device *dev = dst->dev;
  66	struct neighbour *neigh;
  67	struct in6_addr *nexthop;
  68	int ret;
  69
  70	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
  71		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
  72
  73		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
  74		    ((mroute6_socket(net, skb) &&
  75		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
  76		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
  77					 &ipv6_hdr(skb)->saddr))) {
  78			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
  79
  80			/* Do not check for IFF_ALLMULTI; multicast routing
  81			   is not supported in any case.
  82			 */
  83			if (newskb)
  84				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
  85					net, sk, newskb, NULL, newskb->dev,
  86					dev_loopback_xmit);
  87
  88			if (ipv6_hdr(skb)->hop_limit == 0) {
  89				IP6_INC_STATS(net, idev,
  90					      IPSTATS_MIB_OUTDISCARDS);
  91				kfree_skb(skb);
  92				return 0;
  93			}
  94		}
  95
  96		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
  97
  98		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
  99		    IPV6_ADDR_SCOPE_NODELOCAL &&
 100		    !(dev->flags & IFF_LOOPBACK)) {
 101			kfree_skb(skb);
 102			return 0;
 103		}
 104	}
 105
 106	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
 107		int res = lwtunnel_xmit(skb);
 108
 109		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
 110			return res;
 111	}
 112
 113	rcu_read_lock_bh();
 114	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 115	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
 116	if (unlikely(!neigh))
 117		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
 118	if (!IS_ERR(neigh)) {
 119		sock_confirm_neigh(skb, neigh);
 120		ret = neigh_output(neigh, skb);
 121		rcu_read_unlock_bh();
 122		return ret;
 123	}
 124	rcu_read_unlock_bh();
 125
 126	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 127	kfree_skb(skb);
 128	return -EINVAL;
 129}
 130
 131static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 132{
 133	int ret;
 134
 135	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
 136	if (ret) {
 137		kfree_skb(skb);
 138		return ret;
 139	}
 140
 141#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
 142	/* Policy lookup after SNAT yielded a new policy */
 143	if (skb_dst(skb)->xfrm) {
 144		IPCB(skb)->flags |= IPSKB_REROUTED;
 145		return dst_output(net, sk, skb);
 146	}
 147#endif
 148
 149	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
 150	    dst_allfrag(skb_dst(skb)) ||
 151	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
 152		return ip6_fragment(net, sk, skb, ip6_finish_output2);
 153	else
 154		return ip6_finish_output2(net, sk, skb);
 155}
 156
 157int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 158{
 159	struct net_device *dev = skb_dst(skb)->dev;
 160	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
 161
 162	skb->protocol = htons(ETH_P_IPV6);
 163	skb->dev = dev;
 164
 165	if (unlikely(idev->cnf.disable_ipv6)) {
 166		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 167		kfree_skb(skb);
 168		return 0;
 169	}
 170
 171	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
 172			    net, sk, skb, NULL, dev,
 173			    ip6_finish_output,
 174			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 175}
 176
 177bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
 178{
 179	if (!np->autoflowlabel_set)
 180		return ip6_default_np_autolabel(net);
 181	else
 182		return np->autoflowlabel;
 183}
 184
 185/*
 186 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 187 * Note : socket lock is not held for SYNACK packets, but might be modified
 188 * by calls to skb_set_owner_w() and ipv6_local_error(),
 189 * which are using proper atomic operations or spinlocks.
 190 */
 191int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 192	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
 193{
 194	struct net *net = sock_net(sk);
 195	const struct ipv6_pinfo *np = inet6_sk(sk);
 196	struct in6_addr *first_hop = &fl6->daddr;
 197	struct dst_entry *dst = skb_dst(skb);
 198	struct ipv6hdr *hdr;
 199	u8  proto = fl6->flowi6_proto;
 200	int seg_len = skb->len;
 201	int hlimit = -1;
 202	u32 mtu;
 203
 204	if (opt) {
 205		unsigned int head_room;
 206
 207		/* First: exthdrs may take lots of space (~8K for now)
 208		   MAX_HEADER is not enough.
 209		 */
 210		head_room = opt->opt_nflen + opt->opt_flen;
 211		seg_len += head_room;
 212		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
 213
 214		if (skb_headroom(skb) < head_room) {
 215			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
 216			if (!skb2) {
 217				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 218					      IPSTATS_MIB_OUTDISCARDS);
 219				kfree_skb(skb);
 220				return -ENOBUFS;
 221			}
 222			consume_skb(skb);
 223			skb = skb2;
 224			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
 225			 * it is safe to call in our context (socket lock not held)
 226			 */
 227			skb_set_owner_w(skb, (struct sock *)sk);
 228		}
 229		if (opt->opt_flen)
 230			ipv6_push_frag_opts(skb, opt, &proto);
 231		if (opt->opt_nflen)
 232			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
 233					     &fl6->saddr);
 234	}
 235
 236	skb_push(skb, sizeof(struct ipv6hdr));
 237	skb_reset_network_header(skb);
 238	hdr = ipv6_hdr(skb);
 239
 240	/*
 241	 *	Fill in the IPv6 header
 242	 */
 243	if (np)
 244		hlimit = np->hop_limit;
 245	if (hlimit < 0)
 246		hlimit = ip6_dst_hoplimit(dst);
 247
 248	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
 249				ip6_autoflowlabel(net, np), fl6));
 250
 251	hdr->payload_len = htons(seg_len);
 252	hdr->nexthdr = proto;
 253	hdr->hop_limit = hlimit;
 254
 255	hdr->saddr = fl6->saddr;
 256	hdr->daddr = *first_hop;
 257
 258	skb->protocol = htons(ETH_P_IPV6);
 259	skb->priority = sk->sk_priority;
 260	skb->mark = mark;
 261
 262	mtu = dst_mtu(dst);
 263	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
 264		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
 265			      IPSTATS_MIB_OUT, skb->len);
 266
 267		/* if egress device is enslaved to an L3 master device pass the
 268		 * skb to its handler for processing
 269		 */
 270		skb = l3mdev_ip6_out((struct sock *)sk, skb);
 271		if (unlikely(!skb))
 272			return 0;
 273
 274		/* hooks should never assume socket lock is held.
 275		 * we promote our socket to non const
 276		 */
 277		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
 278			       net, (struct sock *)sk, skb, NULL, dst->dev,
 279			       dst_output);
 280	}
 281
 282	skb->dev = dst->dev;
 283	/* ipv6_local_error() does not require socket lock,
 284	 * we promote our socket to non const
 285	 */
 286	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 287
 288	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
 289	kfree_skb(skb);
 290	return -EMSGSIZE;
 291}
 292EXPORT_SYMBOL(ip6_xmit);
 293
 294static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
 295{
 296	struct ip6_ra_chain *ra;
 297	struct sock *last = NULL;
 298
 299	read_lock(&ip6_ra_lock);
 300	for (ra = ip6_ra_chain; ra; ra = ra->next) {
 301		struct sock *sk = ra->sk;
 302		if (sk && ra->sel == sel &&
 303		    (!sk->sk_bound_dev_if ||
 304		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
 305			if (last) {
 306				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 307				if (skb2)
 308					rawv6_rcv(last, skb2);
 309			}
 310			last = sk;
 311		}
 312	}
 313
 314	if (last) {
 315		rawv6_rcv(last, skb);
 316		read_unlock(&ip6_ra_lock);
 317		return 1;
 318	}
 319	read_unlock(&ip6_ra_lock);
 320	return 0;
 321}
 322
 323static int ip6_forward_proxy_check(struct sk_buff *skb)
 324{
 325	struct ipv6hdr *hdr = ipv6_hdr(skb);
 326	u8 nexthdr = hdr->nexthdr;
 327	__be16 frag_off;
 328	int offset;
 329
 330	if (ipv6_ext_hdr(nexthdr)) {
 331		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
 332		if (offset < 0)
 333			return 0;
 334	} else
 335		offset = sizeof(struct ipv6hdr);
 336
 337	if (nexthdr == IPPROTO_ICMPV6) {
 338		struct icmp6hdr *icmp6;
 339
 340		if (!pskb_may_pull(skb, (skb_network_header(skb) +
 341					 offset + 1 - skb->data)))
 342			return 0;
 343
 344		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
 345
 346		switch (icmp6->icmp6_type) {
 347		case NDISC_ROUTER_SOLICITATION:
 348		case NDISC_ROUTER_ADVERTISEMENT:
 349		case NDISC_NEIGHBOUR_SOLICITATION:
 350		case NDISC_NEIGHBOUR_ADVERTISEMENT:
 351		case NDISC_REDIRECT:
 352			/* For reaction involving unicast neighbor discovery
 353			 * message destined to the proxied address, pass it to
 354			 * input function.
 355			 */
 356			return 1;
 357		default:
 358			break;
 359		}
 360	}
 361
 362	/*
 363	 * The proxying router can't forward traffic sent to a link-local
 364	 * address, so signal the sender and discard the packet. This
 365	 * behavior is clarified by the MIPv6 specification.
 366	 */
 367	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
 368		dst_link_failure(skb);
 369		return -1;
 370	}
 371
 372	return 0;
 373}
 374
 375static inline int ip6_forward_finish(struct net *net, struct sock *sk,
 376				     struct sk_buff *skb)
 377{
 378	return dst_output(net, sk, skb);
 379}
 380
 381unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
 382{
 383	unsigned int mtu;
 384	struct inet6_dev *idev;
 385
 386	if (dst_metric_locked(dst, RTAX_MTU)) {
 387		mtu = dst_metric_raw(dst, RTAX_MTU);
 388		if (mtu)
 389			return mtu;
 390	}
 391
 392	mtu = IPV6_MIN_MTU;
 393	rcu_read_lock();
 394	idev = __in6_dev_get(dst->dev);
 395	if (idev)
 396		mtu = idev->cnf.mtu6;
 397	rcu_read_unlock();
 398
 399	return mtu;
 400}
 401EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
 402
 403static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 404{
 405	if (skb->len <= mtu)
 406		return false;
 407
 408	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
 409	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
 410		return true;
 411
 412	if (skb->ignore_df)
 413		return false;
 414
 415	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
 416		return false;
 417
 418	return true;
 419}
 420
 421int ip6_forward(struct sk_buff *skb)
 422{
 423	struct dst_entry *dst = skb_dst(skb);
 424	struct ipv6hdr *hdr = ipv6_hdr(skb);
 425	struct inet6_skb_parm *opt = IP6CB(skb);
 426	struct net *net = dev_net(dst->dev);
 427	u32 mtu;
 428
 429	if (net->ipv6.devconf_all->forwarding == 0)
 430		goto error;
 431
 432	if (skb->pkt_type != PACKET_HOST)
 433		goto drop;
 434
 435	if (unlikely(skb->sk))
 436		goto drop;
 437
 438	if (skb_warn_if_lro(skb))
 439		goto drop;
 440
 441	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
 442		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 443				IPSTATS_MIB_INDISCARDS);
 444		goto drop;
 445	}
 446
 447	skb_forward_csum(skb);
 448
 449	/*
 450	 *	We DO NOT make any processing on
 451	 *	RA packets, pushing them to user level AS IS
 452	 *	without ane WARRANTY that application will be able
 453	 *	to interpret them. The reason is that we
 454	 *	cannot make anything clever here.
 455	 *
 456	 *	We are not end-node, so that if packet contains
 457	 *	AH/ESP, we cannot make anything.
 458	 *	Defragmentation also would be mistake, RA packets
 459	 *	cannot be fragmented, because there is no warranty
 460	 *	that different fragments will go along one path. --ANK
 461	 */
 462	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
 463		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
 464			return 0;
 465	}
 466
 467	/*
 468	 *	check and decrement ttl
 469	 */
 470	if (hdr->hop_limit <= 1) {
 471		/* Force OUTPUT device used as source address */
 472		skb->dev = dst->dev;
 473		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
 474		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 475				IPSTATS_MIB_INHDRERRORS);
 476
 477		kfree_skb(skb);
 478		return -ETIMEDOUT;
 479	}
 480
 481	/* XXX: idev->cnf.proxy_ndp? */
 482	if (net->ipv6.devconf_all->proxy_ndp &&
 483	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
 484		int proxied = ip6_forward_proxy_check(skb);
 485		if (proxied > 0)
 486			return ip6_input(skb);
 487		else if (proxied < 0) {
 488			__IP6_INC_STATS(net, ip6_dst_idev(dst),
 489					IPSTATS_MIB_INDISCARDS);
 490			goto drop;
 491		}
 492	}
 493
 494	if (!xfrm6_route_forward(skb)) {
 495		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 496				IPSTATS_MIB_INDISCARDS);
 497		goto drop;
 498	}
 499	dst = skb_dst(skb);
 500
 501	/* IPv6 specs say nothing about it, but it is clear that we cannot
 502	   send redirects to source routed frames.
 503	   We don't send redirects to frames decapsulated from IPsec.
 504	 */
 505	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
 506		struct in6_addr *target = NULL;
 507		struct inet_peer *peer;
 508		struct rt6_info *rt;
 509
 510		/*
 511		 *	incoming and outgoing devices are the same
 512		 *	send a redirect.
 513		 */
 514
 515		rt = (struct rt6_info *) dst;
 516		if (rt->rt6i_flags & RTF_GATEWAY)
 517			target = &rt->rt6i_gateway;
 518		else
 519			target = &hdr->daddr;
 520
 521		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
 522
 523		/* Limit redirects both by destination (here)
 524		   and by source (inside ndisc_send_redirect)
 525		 */
 526		if (inet_peer_xrlim_allow(peer, 1*HZ))
 527			ndisc_send_redirect(skb, target);
 528		if (peer)
 529			inet_putpeer(peer);
 530	} else {
 531		int addrtype = ipv6_addr_type(&hdr->saddr);
 532
 533		/* This check is security critical. */
 534		if (addrtype == IPV6_ADDR_ANY ||
 535		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
 536			goto error;
 537		if (addrtype & IPV6_ADDR_LINKLOCAL) {
 538			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
 539				    ICMPV6_NOT_NEIGHBOUR, 0);
 540			goto error;
 541		}
 542	}
 543
 544	mtu = ip6_dst_mtu_forward(dst);
 545	if (mtu < IPV6_MIN_MTU)
 546		mtu = IPV6_MIN_MTU;
 547
 548	if (ip6_pkt_too_big(skb, mtu)) {
 549		/* Again, force OUTPUT device used as source address */
 550		skb->dev = dst->dev;
 551		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 552		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 553				IPSTATS_MIB_INTOOBIGERRORS);
 554		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 555				IPSTATS_MIB_FRAGFAILS);
 556		kfree_skb(skb);
 557		return -EMSGSIZE;
 558	}
 559
 560	if (skb_cow(skb, dst->dev->hard_header_len)) {
 561		__IP6_INC_STATS(net, ip6_dst_idev(dst),
 562				IPSTATS_MIB_OUTDISCARDS);
 563		goto drop;
 564	}
 565
 566	hdr = ipv6_hdr(skb);
 567
 568	/* Mangling hops number delayed to point after skb COW */
 569
 570	hdr->hop_limit--;
 571
 572	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 573	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 574	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
 575		       net, NULL, skb, skb->dev, dst->dev,
 576		       ip6_forward_finish);
 577
 578error:
 579	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
 580drop:
 581	kfree_skb(skb);
 582	return -EINVAL;
 583}
 584
 585static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 586{
 587	to->pkt_type = from->pkt_type;
 588	to->priority = from->priority;
 589	to->protocol = from->protocol;
 590	skb_dst_drop(to);
 591	skb_dst_set(to, dst_clone(skb_dst(from)));
 592	to->dev = from->dev;
 593	to->mark = from->mark;
 594
 595#ifdef CONFIG_NET_SCHED
 596	to->tc_index = from->tc_index;
 597#endif
 598	nf_copy(to, from);
 599	skb_copy_secmark(to, from);
 600}
 601
 602int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
 603		 int (*output)(struct net *, struct sock *, struct sk_buff *))
 604{
 605	struct sk_buff *frag;
 606	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
 607	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
 608				inet6_sk(skb->sk) : NULL;
 609	struct ipv6hdr *tmp_hdr;
 610	struct frag_hdr *fh;
 611	unsigned int mtu, hlen, left, len;
 612	int hroom, troom;
 613	__be32 frag_id;
 614	int ptr, offset = 0, err = 0;
 615	u8 *prevhdr, nexthdr = 0;
 616
 617	err = ip6_find_1stfragopt(skb, &prevhdr);
 618	if (err < 0)
 619		goto fail;
 620	hlen = err;
 621	nexthdr = *prevhdr;
 622
 623	mtu = ip6_skb_dst_mtu(skb);
 624
 625	/* We must not fragment if the socket is set to force MTU discovery
 626	 * or if the skb it not generated by a local socket.
 627	 */
 628	if (unlikely(!skb->ignore_df && skb->len > mtu))
 629		goto fail_toobig;
 630
 631	if (IP6CB(skb)->frag_max_size) {
 632		if (IP6CB(skb)->frag_max_size > mtu)
 633			goto fail_toobig;
 634
 635		/* don't send fragments larger than what we received */
 636		mtu = IP6CB(skb)->frag_max_size;
 637		if (mtu < IPV6_MIN_MTU)
 638			mtu = IPV6_MIN_MTU;
 639	}
 640
 641	if (np && np->frag_size < mtu) {
 642		if (np->frag_size)
 643			mtu = np->frag_size;
 644	}
 645	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
 646		goto fail_toobig;
 647	mtu -= hlen + sizeof(struct frag_hdr);
 648
 649	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
 650				    &ipv6_hdr(skb)->saddr);
 651
 652	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 653	    (err = skb_checksum_help(skb)))
 654		goto fail;
 655
 656	hroom = LL_RESERVED_SPACE(rt->dst.dev);
 657	if (skb_has_frag_list(skb)) {
 658		unsigned int first_len = skb_pagelen(skb);
 659		struct sk_buff *frag2;
 660
 661		if (first_len - hlen > mtu ||
 662		    ((first_len - hlen) & 7) ||
 663		    skb_cloned(skb) ||
 664		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
 665			goto slow_path;
 666
 667		skb_walk_frags(skb, frag) {
 668			/* Correct geometry. */
 669			if (frag->len > mtu ||
 670			    ((frag->len & 7) && frag->next) ||
 671			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
 672				goto slow_path_clean;
 673
 674			/* Partially cloned skb? */
 675			if (skb_shared(frag))
 676				goto slow_path_clean;
 677
 678			BUG_ON(frag->sk);
 679			if (skb->sk) {
 680				frag->sk = skb->sk;
 681				frag->destructor = sock_wfree;
 682			}
 683			skb->truesize -= frag->truesize;
 684		}
 685
 686		err = 0;
 687		offset = 0;
 688		/* BUILD HEADER */
 689
 690		*prevhdr = NEXTHDR_FRAGMENT;
 691		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 692		if (!tmp_hdr) {
 693			err = -ENOMEM;
 694			goto fail;
 695		}
 696		frag = skb_shinfo(skb)->frag_list;
 697		skb_frag_list_init(skb);
 698
 699		__skb_pull(skb, hlen);
 700		fh = __skb_push(skb, sizeof(struct frag_hdr));
 701		__skb_push(skb, hlen);
 702		skb_reset_network_header(skb);
 703		memcpy(skb_network_header(skb), tmp_hdr, hlen);
 704
 705		fh->nexthdr = nexthdr;
 706		fh->reserved = 0;
 707		fh->frag_off = htons(IP6_MF);
 708		fh->identification = frag_id;
 709
 710		first_len = skb_pagelen(skb);
 711		skb->data_len = first_len - skb_headlen(skb);
 712		skb->len = first_len;
 713		ipv6_hdr(skb)->payload_len = htons(first_len -
 714						   sizeof(struct ipv6hdr));
 715
 716		for (;;) {
 717			/* Prepare header of the next frame,
 718			 * before previous one went down. */
 719			if (frag) {
 720				frag->ip_summed = CHECKSUM_NONE;
 721				skb_reset_transport_header(frag);
 722				fh = __skb_push(frag, sizeof(struct frag_hdr));
 723				__skb_push(frag, hlen);
 724				skb_reset_network_header(frag);
 725				memcpy(skb_network_header(frag), tmp_hdr,
 726				       hlen);
 727				offset += skb->len - hlen - sizeof(struct frag_hdr);
 728				fh->nexthdr = nexthdr;
 729				fh->reserved = 0;
 730				fh->frag_off = htons(offset);
 731				if (frag->next)
 732					fh->frag_off |= htons(IP6_MF);
 733				fh->identification = frag_id;
 734				ipv6_hdr(frag)->payload_len =
 735						htons(frag->len -
 736						      sizeof(struct ipv6hdr));
 737				ip6_copy_metadata(frag, skb);
 738			}
 739
 740			err = output(net, sk, skb);
 741			if (!err)
 742				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 743					      IPSTATS_MIB_FRAGCREATES);
 744
 745			if (err || !frag)
 746				break;
 747
 748			skb = frag;
 749			frag = skb->next;
 750			skb->next = NULL;
 751		}
 752
 753		kfree(tmp_hdr);
 754
 755		if (err == 0) {
 756			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 757				      IPSTATS_MIB_FRAGOKS);
 758			return 0;
 759		}
 760
 761		kfree_skb_list(frag);
 762
 763		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
 764			      IPSTATS_MIB_FRAGFAILS);
 765		return err;
 766
 767slow_path_clean:
 768		skb_walk_frags(skb, frag2) {
 769			if (frag2 == frag)
 770				break;
 771			frag2->sk = NULL;
 772			frag2->destructor = NULL;
 773			skb->truesize += frag2->truesize;
 774		}
 775	}
 776
 777slow_path:
 778	left = skb->len - hlen;		/* Space per frame */
 779	ptr = hlen;			/* Where to start from */
 780
 781	/*
 782	 *	Fragment the datagram.
 783	 */
 784
 785	troom = rt->dst.dev->needed_tailroom;
 786
 787	/*
 788	 *	Keep copying data until we run out.
 789	 */
 790	while (left > 0)	{
 791		u8 *fragnexthdr_offset;
 792
 793		len = left;
 794		/* IF: it doesn't fit, use 'mtu' - the data space left */
 795		if (len > mtu)
 796			len = mtu;
 797		/* IF: we are not sending up to and including the packet end
 798		   then align the next start on an eight byte boundary */
 799		if (len < left)	{
 800			len &= ~7;
 801		}
 802
 803		/* Allocate buffer */
 804		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 805				 hroom + troom, GFP_ATOMIC);
 806		if (!frag) {
 807			err = -ENOMEM;
 808			goto fail;
 809		}
 810
 811		/*
 812		 *	Set up data on packet
 813		 */
 814
 815		ip6_copy_metadata(frag, skb);
 816		skb_reserve(frag, hroom);
 817		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
 818		skb_reset_network_header(frag);
 819		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
 820		frag->transport_header = (frag->network_header + hlen +
 821					  sizeof(struct frag_hdr));
 822
 823		/*
 824		 *	Charge the memory for the fragment to any owner
 825		 *	it might possess
 826		 */
 827		if (skb->sk)
 828			skb_set_owner_w(frag, skb->sk);
 829
 830		/*
 831		 *	Copy the packet header into the new buffer.
 832		 */
 833		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
 834
 835		fragnexthdr_offset = skb_network_header(frag);
 836		fragnexthdr_offset += prevhdr - skb_network_header(skb);
 837		*fragnexthdr_offset = NEXTHDR_FRAGMENT;
 838
 839		/*
 840		 *	Build fragment header.
 841		 */
 842		fh->nexthdr = nexthdr;
 843		fh->reserved = 0;
 844		fh->identification = frag_id;
 845
 846		/*
 847		 *	Copy a block of the IP datagram.
 848		 */
 849		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
 850				     len));
 851		left -= len;
 852
 853		fh->frag_off = htons(offset);
 854		if (left > 0)
 855			fh->frag_off |= htons(IP6_MF);
 856		ipv6_hdr(frag)->payload_len = htons(frag->len -
 857						    sizeof(struct ipv6hdr));
 858
 859		ptr += len;
 860		offset += len;
 861
 862		/*
 863		 *	Put this fragment into the sending queue.
 864		 */
 865		err = output(net, sk, frag);
 866		if (err)
 867			goto fail;
 868
 869		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 870			      IPSTATS_MIB_FRAGCREATES);
 871	}
 872	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 873		      IPSTATS_MIB_FRAGOKS);
 874	consume_skb(skb);
 875	return err;
 876
 877fail_toobig:
 878	if (skb->sk && dst_allfrag(skb_dst(skb)))
 879		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 880
 881	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
 882	err = -EMSGSIZE;
 883
 884fail:
 885	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 886		      IPSTATS_MIB_FRAGFAILS);
 887	kfree_skb(skb);
 888	return err;
 889}
 890
 891static inline int ip6_rt_check(const struct rt6key *rt_key,
 892			       const struct in6_addr *fl_addr,
 893			       const struct in6_addr *addr_cache)
 894{
 895	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
 896		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
 897}
 898
 899static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
 900					  struct dst_entry *dst,
 901					  const struct flowi6 *fl6)
 902{
 903	struct ipv6_pinfo *np = inet6_sk(sk);
 904	struct rt6_info *rt;
 905
 906	if (!dst)
 907		goto out;
 908
 909	if (dst->ops->family != AF_INET6) {
 910		dst_release(dst);
 911		return NULL;
 912	}
 913
 914	rt = (struct rt6_info *)dst;
 915	/* Yes, checking route validity in not connected
 916	 * case is not very simple. Take into account,
 917	 * that we do not support routing by source, TOS,
 918	 * and MSG_DONTROUTE		--ANK (980726)
 919	 *
 920	 * 1. ip6_rt_check(): If route was host route,
 921	 *    check that cached destination is current.
 922	 *    If it is network route, we still may
 923	 *    check its validity using saved pointer
 924	 *    to the last used address: daddr_cache.
 925	 *    We do not want to save whole address now,
 926	 *    (because main consumer of this service
 927	 *    is tcp, which has not this problem),
 928	 *    so that the last trick works only on connected
 929	 *    sockets.
 930	 * 2. oif also should be the same.
 931	 */
 932	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
 933#ifdef CONFIG_IPV6_SUBTREES
 934	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
 935#endif
 936	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
 937	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
 938		dst_release(dst);
 939		dst = NULL;
 940	}
 941
 942out:
 943	return dst;
 944}
 945
 946static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
 947			       struct dst_entry **dst, struct flowi6 *fl6)
 948{
 949#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 950	struct neighbour *n;
 951	struct rt6_info *rt;
 952#endif
 953	int err;
 954	int flags = 0;
 955
 956	/* The correct way to handle this would be to do
 957	 * ip6_route_get_saddr, and then ip6_route_output; however,
 958	 * the route-specific preferred source forces the
 959	 * ip6_route_output call _before_ ip6_route_get_saddr.
 960	 *
 961	 * In source specific routing (no src=any default route),
 962	 * ip6_route_output will fail given src=any saddr, though, so
 963	 * that's why we try it again later.
 964	 */
 965	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
 966		struct rt6_info *rt;
 967		bool had_dst = *dst != NULL;
 968
 969		if (!had_dst)
 970			*dst = ip6_route_output(net, sk, fl6);
 971		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
 972		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
 973					  sk ? inet6_sk(sk)->srcprefs : 0,
 974					  &fl6->saddr);
 975		if (err)
 976			goto out_err_release;
 977
 978		/* If we had an erroneous initial result, pretend it
 979		 * never existed and let the SA-enabled version take
 980		 * over.
 981		 */
 982		if (!had_dst && (*dst)->error) {
 983			dst_release(*dst);
 984			*dst = NULL;
 985		}
 986
 987		if (fl6->flowi6_oif)
 988			flags |= RT6_LOOKUP_F_IFACE;
 989	}
 990
 991	if (!*dst)
 992		*dst = ip6_route_output_flags(net, sk, fl6, flags);
 993
 994	err = (*dst)->error;
 995	if (err)
 996		goto out_err_release;
 997
 998#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
 999	/*
1000	 * Here if the dst entry we've looked up
1001	 * has a neighbour entry that is in the INCOMPLETE
1002	 * state and the src address from the flow is
1003	 * marked as OPTIMISTIC, we release the found
1004	 * dst entry and replace it instead with the
1005	 * dst entry of the nexthop router
1006	 */
1007	rt = (struct rt6_info *) *dst;
1008	rcu_read_lock_bh();
1009	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010				      rt6_nexthop(rt, &fl6->daddr));
1011	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012	rcu_read_unlock_bh();
1013
1014	if (err) {
1015		struct inet6_ifaddr *ifp;
1016		struct flowi6 fl_gw6;
1017		int redirect;
1018
1019		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020				      (*dst)->dev, 1);
1021
1022		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023		if (ifp)
1024			in6_ifa_put(ifp);
1025
1026		if (redirect) {
1027			/*
1028			 * We need to get the dst entry for the
1029			 * default router instead
1030			 */
1031			dst_release(*dst);
1032			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034			*dst = ip6_route_output(net, sk, &fl_gw6);
1035			err = (*dst)->error;
1036			if (err)
1037				goto out_err_release;
1038		}
1039	}
1040#endif
1041	if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043		err = -EAFNOSUPPORT;
1044		goto out_err_release;
1045	}
1046
1047	return 0;
1048
1049out_err_release:
1050	dst_release(*dst);
1051	*dst = NULL;
1052
1053	if (err == -ENETUNREACH)
1054		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055	return err;
1056}
1057
1058/**
1059 *	ip6_dst_lookup - perform route lookup on flow
1060 *	@sk: socket which provides route info
1061 *	@dst: pointer to dst_entry * for result
1062 *	@fl6: flow to lookup
1063 *
1064 *	This function performs a route lookup on the given flow.
1065 *
1066 *	It returns zero on success, or a standard errno code on error.
1067 */
1068int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069		   struct flowi6 *fl6)
1070{
1071	*dst = NULL;
1072	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073}
1074EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075
1076/**
1077 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078 *	@sk: socket which provides route info
1079 *	@fl6: flow to lookup
1080 *	@final_dst: final destination address for ipsec lookup
1081 *
1082 *	This function performs a route lookup on the given flow.
1083 *
1084 *	It returns a valid dst pointer on success, or a pointer encoded
1085 *	error code.
1086 */
1087struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088				      const struct in6_addr *final_dst)
1089{
1090	struct dst_entry *dst = NULL;
1091	int err;
1092
1093	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094	if (err)
1095		return ERR_PTR(err);
1096	if (final_dst)
1097		fl6->daddr = *final_dst;
1098
1099	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100}
1101EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102
1103/**
1104 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105 *	@sk: socket which provides the dst cache and route info
1106 *	@fl6: flow to lookup
1107 *	@final_dst: final destination address for ipsec lookup
1108 *
1109 *	This function performs a route lookup on the given flow with the
1110 *	possibility of using the cached route in the socket if it is valid.
1111 *	It will take the socket dst lock when operating on the dst cache.
1112 *	As a result, this function can only be used in process context.
1113 *
1114 *	It returns a valid dst pointer on success, or a pointer encoded
1115 *	error code.
1116 */
1117struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1118					 const struct in6_addr *final_dst)
1119{
1120	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1121
1122	dst = ip6_sk_dst_check(sk, dst, fl6);
1123	if (!dst)
1124		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125
1126	return dst;
1127}
1128EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131					       gfp_t gfp)
1132{
1133	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134}
1135
1136static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137						gfp_t gfp)
1138{
1139	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140}
1141
1142static void ip6_append_data_mtu(unsigned int *mtu,
1143				int *maxfraglen,
1144				unsigned int fragheaderlen,
1145				struct sk_buff *skb,
1146				struct rt6_info *rt,
1147				unsigned int orig_mtu)
1148{
1149	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150		if (!skb) {
1151			/* first fragment, reserve header_len */
1152			*mtu = orig_mtu - rt->dst.header_len;
1153
1154		} else {
1155			/*
1156			 * this fragment is not first, the headers
1157			 * space is regarded as data space.
1158			 */
1159			*mtu = orig_mtu;
1160		}
1161		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162			      + fragheaderlen - sizeof(struct frag_hdr);
1163	}
1164}
1165
1166static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1167			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1168			  struct rt6_info *rt, struct flowi6 *fl6)
1169{
1170	struct ipv6_pinfo *np = inet6_sk(sk);
1171	unsigned int mtu;
1172	struct ipv6_txoptions *opt = ipc6->opt;
1173
1174	/*
1175	 * setup for corking
1176	 */
1177	if (opt) {
1178		if (WARN_ON(v6_cork->opt))
1179			return -EINVAL;
1180
1181		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1182		if (unlikely(!v6_cork->opt))
1183			return -ENOBUFS;
1184
1185		v6_cork->opt->tot_len = sizeof(*opt);
1186		v6_cork->opt->opt_flen = opt->opt_flen;
1187		v6_cork->opt->opt_nflen = opt->opt_nflen;
1188
1189		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1190						    sk->sk_allocation);
1191		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1192			return -ENOBUFS;
1193
1194		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1195						    sk->sk_allocation);
1196		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1197			return -ENOBUFS;
1198
1199		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1200						   sk->sk_allocation);
1201		if (opt->hopopt && !v6_cork->opt->hopopt)
1202			return -ENOBUFS;
1203
1204		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1205						    sk->sk_allocation);
1206		if (opt->srcrt && !v6_cork->opt->srcrt)
1207			return -ENOBUFS;
1208
1209		/* need source address above miyazawa*/
1210	}
1211	dst_hold(&rt->dst);
1212	cork->base.dst = &rt->dst;
1213	cork->fl.u.ip6 = *fl6;
1214	v6_cork->hop_limit = ipc6->hlimit;
1215	v6_cork->tclass = ipc6->tclass;
1216	if (rt->dst.flags & DST_XFRM_TUNNEL)
1217		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1219	else
1220		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1222	if (np->frag_size < mtu) {
1223		if (np->frag_size)
1224			mtu = np->frag_size;
1225	}
1226	if (mtu < IPV6_MIN_MTU)
1227		return -EINVAL;
1228	cork->base.fragsize = mtu;
1229	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1230		cork->base.flags |= IPCORK_ALLFRAG;
1231	cork->base.length = 0;
1232
1233	return 0;
1234}
1235
1236static int __ip6_append_data(struct sock *sk,
1237			     struct flowi6 *fl6,
1238			     struct sk_buff_head *queue,
1239			     struct inet_cork *cork,
1240			     struct inet6_cork *v6_cork,
1241			     struct page_frag *pfrag,
1242			     int getfrag(void *from, char *to, int offset,
1243					 int len, int odd, struct sk_buff *skb),
1244			     void *from, int length, int transhdrlen,
1245			     unsigned int flags, struct ipcm6_cookie *ipc6,
1246			     const struct sockcm_cookie *sockc)
1247{
1248	struct sk_buff *skb, *skb_prev = NULL;
1249	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1250	int exthdrlen = 0;
1251	int dst_exthdrlen = 0;
1252	int hh_len;
1253	int copy;
1254	int err;
1255	int offset = 0;
1256	__u8 tx_flags = 0;
1257	u32 tskey = 0;
1258	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1259	struct ipv6_txoptions *opt = v6_cork->opt;
1260	int csummode = CHECKSUM_NONE;
1261	unsigned int maxnonfragsize, headersize;
1262
1263	skb = skb_peek_tail(queue);
1264	if (!skb) {
1265		exthdrlen = opt ? opt->opt_flen : 0;
1266		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1267	}
1268
1269	mtu = cork->fragsize;
1270	orig_mtu = mtu;
1271
1272	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1273
1274	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1275			(opt ? opt->opt_nflen : 0);
1276	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1277		     sizeof(struct frag_hdr);
1278
1279	headersize = sizeof(struct ipv6hdr) +
1280		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1281		     (dst_allfrag(&rt->dst) ?
1282		      sizeof(struct frag_hdr) : 0) +
1283		     rt->rt6i_nfheader_len;
1284
1285	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1286	    (sk->sk_protocol == IPPROTO_UDP ||
1287	     sk->sk_protocol == IPPROTO_RAW)) {
1288		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1289				sizeof(struct ipv6hdr));
1290		goto emsgsize;
1291	}
1292
1293	if (ip6_sk_ignore_df(sk))
1294		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1295	else
1296		maxnonfragsize = mtu;
1297
1298	if (cork->length + length > maxnonfragsize - headersize) {
1299emsgsize:
1300		ipv6_local_error(sk, EMSGSIZE, fl6,
1301				 mtu - headersize +
1302				 sizeof(struct ipv6hdr));
1303		return -EMSGSIZE;
1304	}
1305
1306	/* CHECKSUM_PARTIAL only with no extension headers and when
1307	 * we are not going to fragment
1308	 */
1309	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1310	    headersize == sizeof(struct ipv6hdr) &&
1311	    length <= mtu - headersize &&
1312	    !(flags & MSG_MORE) &&
1313	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1314		csummode = CHECKSUM_PARTIAL;
1315
1316	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1317		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1318		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1319		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1320			tskey = sk->sk_tskey++;
1321	}
1322
1323	/*
1324	 * Let's try using as much space as possible.
1325	 * Use MTU if total length of the message fits into the MTU.
1326	 * Otherwise, we need to reserve fragment header and
1327	 * fragment alignment (= 8-15 octects, in total).
1328	 *
1329	 * Note that we may need to "move" the data from the tail of
1330	 * of the buffer to the new fragment when we split
1331	 * the message.
1332	 *
1333	 * FIXME: It may be fragmented into multiple chunks
1334	 *        at once if non-fragmentable extension headers
1335	 *        are too large.
1336	 * --yoshfuji
1337	 */
1338
1339	cork->length += length;
1340	if (!skb)
1341		goto alloc_new_skb;
1342
1343	while (length > 0) {
1344		/* Check if the remaining data fits into current packet. */
1345		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1346		if (copy < length)
1347			copy = maxfraglen - skb->len;
1348
1349		if (copy <= 0) {
1350			char *data;
1351			unsigned int datalen;
1352			unsigned int fraglen;
1353			unsigned int fraggap;
1354			unsigned int alloclen;
1355alloc_new_skb:
1356			/* There's no room in the current skb */
1357			if (skb)
1358				fraggap = skb->len - maxfraglen;
1359			else
1360				fraggap = 0;
1361			/* update mtu and maxfraglen if necessary */
1362			if (!skb || !skb_prev)
1363				ip6_append_data_mtu(&mtu, &maxfraglen,
1364						    fragheaderlen, skb, rt,
1365						    orig_mtu);
1366
1367			skb_prev = skb;
1368
1369			/*
1370			 * If remaining data exceeds the mtu,
1371			 * we know we need more fragment(s).
1372			 */
1373			datalen = length + fraggap;
1374
1375			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1376				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1377			if ((flags & MSG_MORE) &&
1378			    !(rt->dst.dev->features&NETIF_F_SG))
1379				alloclen = mtu;
1380			else
1381				alloclen = datalen + fragheaderlen;
1382
1383			alloclen += dst_exthdrlen;
1384
1385			if (datalen != length + fraggap) {
1386				/*
1387				 * this is not the last fragment, the trailer
1388				 * space is regarded as data space.
1389				 */
1390				datalen += rt->dst.trailer_len;
1391			}
1392
1393			alloclen += rt->dst.trailer_len;
1394			fraglen = datalen + fragheaderlen;
1395
1396			/*
1397			 * We just reserve space for fragment header.
1398			 * Note: this may be overallocation if the message
1399			 * (without MSG_MORE) fits into the MTU.
1400			 */
1401			alloclen += sizeof(struct frag_hdr);
1402
1403			copy = datalen - transhdrlen - fraggap;
1404			if (copy < 0) {
1405				err = -EINVAL;
1406				goto error;
1407			}
1408			if (transhdrlen) {
1409				skb = sock_alloc_send_skb(sk,
1410						alloclen + hh_len,
1411						(flags & MSG_DONTWAIT), &err);
1412			} else {
1413				skb = NULL;
1414				if (refcount_read(&sk->sk_wmem_alloc) <=
1415				    2 * sk->sk_sndbuf)
1416					skb = sock_wmalloc(sk,
1417							   alloclen + hh_len, 1,
1418							   sk->sk_allocation);
1419				if (unlikely(!skb))
1420					err = -ENOBUFS;
1421			}
1422			if (!skb)
1423				goto error;
1424			/*
1425			 *	Fill in the control structures
1426			 */
1427			skb->protocol = htons(ETH_P_IPV6);
1428			skb->ip_summed = csummode;
1429			skb->csum = 0;
1430			/* reserve for fragmentation and ipsec header */
1431			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1432				    dst_exthdrlen);
1433
1434			/* Only the initial fragment is time stamped */
1435			skb_shinfo(skb)->tx_flags = tx_flags;
1436			tx_flags = 0;
1437			skb_shinfo(skb)->tskey = tskey;
1438			tskey = 0;
1439
1440			/*
1441			 *	Find where to start putting bytes
1442			 */
1443			data = skb_put(skb, fraglen);
1444			skb_set_network_header(skb, exthdrlen);
1445			data += fragheaderlen;
1446			skb->transport_header = (skb->network_header +
1447						 fragheaderlen);
1448			if (fraggap) {
1449				skb->csum = skb_copy_and_csum_bits(
1450					skb_prev, maxfraglen,
1451					data + transhdrlen, fraggap, 0);
1452				skb_prev->csum = csum_sub(skb_prev->csum,
1453							  skb->csum);
1454				data += fraggap;
1455				pskb_trim_unique(skb_prev, maxfraglen);
1456			}
1457			if (copy > 0 &&
1458			    getfrag(from, data + transhdrlen, offset,
1459				    copy, fraggap, skb) < 0) {
1460				err = -EFAULT;
1461				kfree_skb(skb);
1462				goto error;
1463			}
1464
1465			offset += copy;
1466			length -= datalen - fraggap;
1467			transhdrlen = 0;
1468			exthdrlen = 0;
1469			dst_exthdrlen = 0;
1470
1471			if ((flags & MSG_CONFIRM) && !skb_prev)
1472				skb_set_dst_pending_confirm(skb, 1);
1473
1474			/*
1475			 * Put the packet on the pending queue
1476			 */
1477			__skb_queue_tail(queue, skb);
1478			continue;
1479		}
1480
1481		if (copy > length)
1482			copy = length;
1483
1484		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1485			unsigned int off;
1486
1487			off = skb->len;
1488			if (getfrag(from, skb_put(skb, copy),
1489						offset, copy, off, skb) < 0) {
1490				__skb_trim(skb, off);
1491				err = -EFAULT;
1492				goto error;
1493			}
1494		} else {
1495			int i = skb_shinfo(skb)->nr_frags;
1496
1497			err = -ENOMEM;
1498			if (!sk_page_frag_refill(sk, pfrag))
1499				goto error;
1500
1501			if (!skb_can_coalesce(skb, i, pfrag->page,
1502					      pfrag->offset)) {
1503				err = -EMSGSIZE;
1504				if (i == MAX_SKB_FRAGS)
1505					goto error;
1506
1507				__skb_fill_page_desc(skb, i, pfrag->page,
1508						     pfrag->offset, 0);
1509				skb_shinfo(skb)->nr_frags = ++i;
1510				get_page(pfrag->page);
1511			}
1512			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1513			if (getfrag(from,
1514				    page_address(pfrag->page) + pfrag->offset,
1515				    offset, copy, skb->len, skb) < 0)
1516				goto error_efault;
1517
1518			pfrag->offset += copy;
1519			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1520			skb->len += copy;
1521			skb->data_len += copy;
1522			skb->truesize += copy;
1523			refcount_add(copy, &sk->sk_wmem_alloc);
1524		}
1525		offset += copy;
1526		length -= copy;
1527	}
1528
1529	return 0;
1530
1531error_efault:
1532	err = -EFAULT;
1533error:
1534	cork->length -= length;
1535	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1536	return err;
1537}
1538
1539int ip6_append_data(struct sock *sk,
1540		    int getfrag(void *from, char *to, int offset, int len,
1541				int odd, struct sk_buff *skb),
1542		    void *from, int length, int transhdrlen,
1543		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1544		    struct rt6_info *rt, unsigned int flags,
1545		    const struct sockcm_cookie *sockc)
1546{
1547	struct inet_sock *inet = inet_sk(sk);
1548	struct ipv6_pinfo *np = inet6_sk(sk);
1549	int exthdrlen;
1550	int err;
1551
1552	if (flags&MSG_PROBE)
1553		return 0;
1554	if (skb_queue_empty(&sk->sk_write_queue)) {
1555		/*
1556		 * setup for corking
1557		 */
1558		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1559				     ipc6, rt, fl6);
1560		if (err)
1561			return err;
1562
1563		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1564		length += exthdrlen;
1565		transhdrlen += exthdrlen;
1566	} else {
1567		fl6 = &inet->cork.fl.u.ip6;
1568		transhdrlen = 0;
1569	}
1570
1571	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1572				 &np->cork, sk_page_frag(sk), getfrag,
1573				 from, length, transhdrlen, flags, ipc6, sockc);
1574}
1575EXPORT_SYMBOL_GPL(ip6_append_data);
1576
1577static void ip6_cork_release(struct inet_cork_full *cork,
1578			     struct inet6_cork *v6_cork)
1579{
1580	if (v6_cork->opt) {
1581		kfree(v6_cork->opt->dst0opt);
1582		kfree(v6_cork->opt->dst1opt);
1583		kfree(v6_cork->opt->hopopt);
1584		kfree(v6_cork->opt->srcrt);
1585		kfree(v6_cork->opt);
1586		v6_cork->opt = NULL;
1587	}
1588
1589	if (cork->base.dst) {
1590		dst_release(cork->base.dst);
1591		cork->base.dst = NULL;
1592		cork->base.flags &= ~IPCORK_ALLFRAG;
1593	}
1594	memset(&cork->fl, 0, sizeof(cork->fl));
1595}
1596
1597struct sk_buff *__ip6_make_skb(struct sock *sk,
1598			       struct sk_buff_head *queue,
1599			       struct inet_cork_full *cork,
1600			       struct inet6_cork *v6_cork)
1601{
1602	struct sk_buff *skb, *tmp_skb;
1603	struct sk_buff **tail_skb;
1604	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1605	struct ipv6_pinfo *np = inet6_sk(sk);
1606	struct net *net = sock_net(sk);
1607	struct ipv6hdr *hdr;
1608	struct ipv6_txoptions *opt = v6_cork->opt;
1609	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1610	struct flowi6 *fl6 = &cork->fl.u.ip6;
1611	unsigned char proto = fl6->flowi6_proto;
1612
1613	skb = __skb_dequeue(queue);
1614	if (!skb)
1615		goto out;
1616	tail_skb = &(skb_shinfo(skb)->frag_list);
1617
1618	/* move skb->data to ip header from ext header */
1619	if (skb->data < skb_network_header(skb))
1620		__skb_pull(skb, skb_network_offset(skb));
1621	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1622		__skb_pull(tmp_skb, skb_network_header_len(skb));
1623		*tail_skb = tmp_skb;
1624		tail_skb = &(tmp_skb->next);
1625		skb->len += tmp_skb->len;
1626		skb->data_len += tmp_skb->len;
1627		skb->truesize += tmp_skb->truesize;
1628		tmp_skb->destructor = NULL;
1629		tmp_skb->sk = NULL;
1630	}
1631
1632	/* Allow local fragmentation. */
1633	skb->ignore_df = ip6_sk_ignore_df(sk);
1634
1635	*final_dst = fl6->daddr;
1636	__skb_pull(skb, skb_network_header_len(skb));
1637	if (opt && opt->opt_flen)
1638		ipv6_push_frag_opts(skb, opt, &proto);
1639	if (opt && opt->opt_nflen)
1640		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1641
1642	skb_push(skb, sizeof(struct ipv6hdr));
1643	skb_reset_network_header(skb);
1644	hdr = ipv6_hdr(skb);
1645
1646	ip6_flow_hdr(hdr, v6_cork->tclass,
1647		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1648					ip6_autoflowlabel(net, np), fl6));
1649	hdr->hop_limit = v6_cork->hop_limit;
1650	hdr->nexthdr = proto;
1651	hdr->saddr = fl6->saddr;
1652	hdr->daddr = *final_dst;
1653
1654	skb->priority = sk->sk_priority;
1655	skb->mark = sk->sk_mark;
1656
1657	skb_dst_set(skb, dst_clone(&rt->dst));
1658	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1659	if (proto == IPPROTO_ICMPV6) {
1660		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1661
1662		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1663		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1664	}
1665
1666	ip6_cork_release(cork, v6_cork);
1667out:
1668	return skb;
1669}
1670
1671int ip6_send_skb(struct sk_buff *skb)
1672{
1673	struct net *net = sock_net(skb->sk);
1674	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1675	int err;
1676
1677	err = ip6_local_out(net, skb->sk, skb);
1678	if (err) {
1679		if (err > 0)
1680			err = net_xmit_errno(err);
1681		if (err)
1682			IP6_INC_STATS(net, rt->rt6i_idev,
1683				      IPSTATS_MIB_OUTDISCARDS);
1684	}
1685
1686	return err;
1687}
1688
/*
 * Finish the socket's pending datagram and send it.  Returns 0 when
 * ip6_finish_skb() produced nothing to send, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1700
1701static void __ip6_flush_pending_frames(struct sock *sk,
1702				       struct sk_buff_head *queue,
1703				       struct inet_cork_full *cork,
1704				       struct inet6_cork *v6_cork)
1705{
1706	struct sk_buff *skb;
1707
1708	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1709		if (skb_dst(skb))
1710			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1711				      IPSTATS_MIB_OUTDISCARDS);
1712		kfree_skb(skb);
1713	}
1714
1715	ip6_cork_release(cork, v6_cork);
1716}
1717
1718void ip6_flush_pending_frames(struct sock *sk)
1719{
1720	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1721				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1722}
1723EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1724
1725struct sk_buff *ip6_make_skb(struct sock *sk,
1726			     int getfrag(void *from, char *to, int offset,
1727					 int len, int odd, struct sk_buff *skb),
1728			     void *from, int length, int transhdrlen,
1729			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1730			     struct rt6_info *rt, unsigned int flags,
1731			     const struct sockcm_cookie *sockc)
1732{
1733	struct inet_cork_full cork;
1734	struct inet6_cork v6_cork;
1735	struct sk_buff_head queue;
1736	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1737	int err;
1738
1739	if (flags & MSG_PROBE)
1740		return NULL;
1741
1742	__skb_queue_head_init(&queue);
1743
1744	cork.base.flags = 0;
1745	cork.base.addr = 0;
1746	cork.base.opt = NULL;
1747	cork.base.dst = NULL;
1748	v6_cork.opt = NULL;
1749	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1750	if (err) {
1751		ip6_cork_release(&cork, &v6_cork);
1752		return ERR_PTR(err);
1753	}
1754	if (ipc6->dontfrag < 0)
1755		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1756
1757	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1758				&current->task_frag, getfrag, from,
1759				length + exthdrlen, transhdrlen + exthdrlen,
1760				flags, ipc6, sockc);
1761	if (err) {
1762		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1763		return ERR_PTR(err);
1764	}
1765
1766	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1767}
Configure Feed

Configure Feed