Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
fork
Configure Feed
Select the types of activity you want to include in your feed.
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */
28
29#include <linux/errno.h>
30#include <linux/kernel.h>
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
39#include <linux/module.h>
40#include <linux/slab.h>
41
42#include <linux/bpf-cgroup.h>
43#include <linux/netfilter.h>
44#include <linux/netfilter_ipv6.h>
45
46#include <net/sock.h>
47#include <net/snmp.h>
48
49#include <net/ipv6.h>
50#include <net/ndisc.h>
51#include <net/protocol.h>
52#include <net/ip6_route.h>
53#include <net/addrconf.h>
54#include <net/rawv6.h>
55#include <net/icmp.h>
56#include <net/xfrm.h>
57#include <net/checksum.h>
58#include <linux/mroute6.h>
59#include <net/l3mdev.h>
60#include <net/lwtunnel.h>
61
62static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
63{
64 struct dst_entry *dst = skb_dst(skb);
65 struct net_device *dev = dst->dev;
66 struct neighbour *neigh;
67 struct in6_addr *nexthop;
68 int ret;
69
70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72
73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 ((mroute6_socket(net, skb) &&
75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 &ipv6_hdr(skb)->saddr))) {
78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79
80 /* Do not check for IFF_ALLMULTI; multicast routing
81 is not supported in any case.
82 */
83 if (newskb)
84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 net, sk, newskb, NULL, newskb->dev,
86 dev_loopback_xmit);
87
88 if (ipv6_hdr(skb)->hop_limit == 0) {
89 IP6_INC_STATS(net, idev,
90 IPSTATS_MIB_OUTDISCARDS);
91 kfree_skb(skb);
92 return 0;
93 }
94 }
95
96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
97
98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
99 IPV6_ADDR_SCOPE_NODELOCAL &&
100 !(dev->flags & IFF_LOOPBACK)) {
101 kfree_skb(skb);
102 return 0;
103 }
104 }
105
106 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
107 int res = lwtunnel_xmit(skb);
108
109 if (res < 0 || res == LWTUNNEL_XMIT_DONE)
110 return res;
111 }
112
113 rcu_read_lock_bh();
114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
116 if (unlikely(!neigh))
117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
118 if (!IS_ERR(neigh)) {
119 sock_confirm_neigh(skb, neigh);
120 ret = neigh_output(neigh, skb);
121 rcu_read_unlock_bh();
122 return ret;
123 }
124 rcu_read_unlock_bh();
125
126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
127 kfree_skb(skb);
128 return -EINVAL;
129}
130
131static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
132{
133 int ret;
134
135 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
136 if (ret) {
137 kfree_skb(skb);
138 return ret;
139 }
140
141#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
142 /* Policy lookup after SNAT yielded a new policy */
143 if (skb_dst(skb)->xfrm) {
144 IPCB(skb)->flags |= IPSKB_REROUTED;
145 return dst_output(net, sk, skb);
146 }
147#endif
148
149 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
150 dst_allfrag(skb_dst(skb)) ||
151 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
152 return ip6_fragment(net, sk, skb, ip6_finish_output2);
153 else
154 return ip6_finish_output2(net, sk, skb);
155}
156
157int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
158{
159 struct net_device *dev = skb_dst(skb)->dev;
160 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
161
162 skb->protocol = htons(ETH_P_IPV6);
163 skb->dev = dev;
164
165 if (unlikely(idev->cnf.disable_ipv6)) {
166 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
167 kfree_skb(skb);
168 return 0;
169 }
170
171 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
172 net, sk, skb, NULL, dev,
173 ip6_finish_output,
174 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
175}
176
177bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
178{
179 if (!np->autoflowlabel_set)
180 return ip6_default_np_autolabel(net);
181 else
182 return np->autoflowlabel;
183}
184
185/*
186 * xmit an sk_buff (used by TCP, SCTP and DCCP)
187 * Note : socket lock is not held for SYNACK packets, but might be modified
188 * by calls to skb_set_owner_w() and ipv6_local_error(),
189 * which are using proper atomic operations or spinlocks.
190 */
191int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
192 __u32 mark, struct ipv6_txoptions *opt, int tclass)
193{
194 struct net *net = sock_net(sk);
195 const struct ipv6_pinfo *np = inet6_sk(sk);
196 struct in6_addr *first_hop = &fl6->daddr;
197 struct dst_entry *dst = skb_dst(skb);
198 struct ipv6hdr *hdr;
199 u8 proto = fl6->flowi6_proto;
200 int seg_len = skb->len;
201 int hlimit = -1;
202 u32 mtu;
203
204 if (opt) {
205 unsigned int head_room;
206
207 /* First: exthdrs may take lots of space (~8K for now)
208 MAX_HEADER is not enough.
209 */
210 head_room = opt->opt_nflen + opt->opt_flen;
211 seg_len += head_room;
212 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
213
214 if (skb_headroom(skb) < head_room) {
215 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
216 if (!skb2) {
217 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
218 IPSTATS_MIB_OUTDISCARDS);
219 kfree_skb(skb);
220 return -ENOBUFS;
221 }
222 consume_skb(skb);
223 skb = skb2;
224 /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
225 * it is safe to call in our context (socket lock not held)
226 */
227 skb_set_owner_w(skb, (struct sock *)sk);
228 }
229 if (opt->opt_flen)
230 ipv6_push_frag_opts(skb, opt, &proto);
231 if (opt->opt_nflen)
232 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
233 &fl6->saddr);
234 }
235
236 skb_push(skb, sizeof(struct ipv6hdr));
237 skb_reset_network_header(skb);
238 hdr = ipv6_hdr(skb);
239
240 /*
241 * Fill in the IPv6 header
242 */
243 if (np)
244 hlimit = np->hop_limit;
245 if (hlimit < 0)
246 hlimit = ip6_dst_hoplimit(dst);
247
248 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
249 ip6_autoflowlabel(net, np), fl6));
250
251 hdr->payload_len = htons(seg_len);
252 hdr->nexthdr = proto;
253 hdr->hop_limit = hlimit;
254
255 hdr->saddr = fl6->saddr;
256 hdr->daddr = *first_hop;
257
258 skb->protocol = htons(ETH_P_IPV6);
259 skb->priority = sk->sk_priority;
260 skb->mark = mark;
261
262 mtu = dst_mtu(dst);
263 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
264 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
265 IPSTATS_MIB_OUT, skb->len);
266
267 /* if egress device is enslaved to an L3 master device pass the
268 * skb to its handler for processing
269 */
270 skb = l3mdev_ip6_out((struct sock *)sk, skb);
271 if (unlikely(!skb))
272 return 0;
273
274 /* hooks should never assume socket lock is held.
275 * we promote our socket to non const
276 */
277 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
278 net, (struct sock *)sk, skb, NULL, dst->dev,
279 dst_output);
280 }
281
282 skb->dev = dst->dev;
283 /* ipv6_local_error() does not require socket lock,
284 * we promote our socket to non const
285 */
286 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
287
288 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
289 kfree_skb(skb);
290 return -EMSGSIZE;
291}
292EXPORT_SYMBOL(ip6_xmit);
293
294static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
295{
296 struct ip6_ra_chain *ra;
297 struct sock *last = NULL;
298
299 read_lock(&ip6_ra_lock);
300 for (ra = ip6_ra_chain; ra; ra = ra->next) {
301 struct sock *sk = ra->sk;
302 if (sk && ra->sel == sel &&
303 (!sk->sk_bound_dev_if ||
304 sk->sk_bound_dev_if == skb->dev->ifindex)) {
305 if (last) {
306 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
307 if (skb2)
308 rawv6_rcv(last, skb2);
309 }
310 last = sk;
311 }
312 }
313
314 if (last) {
315 rawv6_rcv(last, skb);
316 read_unlock(&ip6_ra_lock);
317 return 1;
318 }
319 read_unlock(&ip6_ra_lock);
320 return 0;
321}
322
323static int ip6_forward_proxy_check(struct sk_buff *skb)
324{
325 struct ipv6hdr *hdr = ipv6_hdr(skb);
326 u8 nexthdr = hdr->nexthdr;
327 __be16 frag_off;
328 int offset;
329
330 if (ipv6_ext_hdr(nexthdr)) {
331 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
332 if (offset < 0)
333 return 0;
334 } else
335 offset = sizeof(struct ipv6hdr);
336
337 if (nexthdr == IPPROTO_ICMPV6) {
338 struct icmp6hdr *icmp6;
339
340 if (!pskb_may_pull(skb, (skb_network_header(skb) +
341 offset + 1 - skb->data)))
342 return 0;
343
344 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
345
346 switch (icmp6->icmp6_type) {
347 case NDISC_ROUTER_SOLICITATION:
348 case NDISC_ROUTER_ADVERTISEMENT:
349 case NDISC_NEIGHBOUR_SOLICITATION:
350 case NDISC_NEIGHBOUR_ADVERTISEMENT:
351 case NDISC_REDIRECT:
352 /* For reaction involving unicast neighbor discovery
353 * message destined to the proxied address, pass it to
354 * input function.
355 */
356 return 1;
357 default:
358 break;
359 }
360 }
361
362 /*
363 * The proxying router can't forward traffic sent to a link-local
364 * address, so signal the sender and discard the packet. This
365 * behavior is clarified by the MIPv6 specification.
366 */
367 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
368 dst_link_failure(skb);
369 return -1;
370 }
371
372 return 0;
373}
374
/*
 * Continuation of ip6_forward() after the FORWARD netfilter hook: pass the
 * packet to the output routine of its route and return that result.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	return dst_output(net, sk, skb);
}
380
381unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
382{
383 unsigned int mtu;
384 struct inet6_dev *idev;
385
386 if (dst_metric_locked(dst, RTAX_MTU)) {
387 mtu = dst_metric_raw(dst, RTAX_MTU);
388 if (mtu)
389 return mtu;
390 }
391
392 mtu = IPV6_MIN_MTU;
393 rcu_read_lock();
394 idev = __in6_dev_get(dst->dev);
395 if (idev)
396 mtu = idev->cnf.mtu6;
397 rcu_read_unlock();
398
399 return mtu;
400}
401EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);
402
403static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
404{
405 if (skb->len <= mtu)
406 return false;
407
408 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
409 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
410 return true;
411
412 if (skb->ignore_df)
413 return false;
414
415 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
416 return false;
417
418 return true;
419}
420
421int ip6_forward(struct sk_buff *skb)
422{
423 struct dst_entry *dst = skb_dst(skb);
424 struct ipv6hdr *hdr = ipv6_hdr(skb);
425 struct inet6_skb_parm *opt = IP6CB(skb);
426 struct net *net = dev_net(dst->dev);
427 u32 mtu;
428
429 if (net->ipv6.devconf_all->forwarding == 0)
430 goto error;
431
432 if (skb->pkt_type != PACKET_HOST)
433 goto drop;
434
435 if (unlikely(skb->sk))
436 goto drop;
437
438 if (skb_warn_if_lro(skb))
439 goto drop;
440
441 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
442 __IP6_INC_STATS(net, ip6_dst_idev(dst),
443 IPSTATS_MIB_INDISCARDS);
444 goto drop;
445 }
446
447 skb_forward_csum(skb);
448
449 /*
450 * We DO NOT make any processing on
451 * RA packets, pushing them to user level AS IS
452 * without ane WARRANTY that application will be able
453 * to interpret them. The reason is that we
454 * cannot make anything clever here.
455 *
456 * We are not end-node, so that if packet contains
457 * AH/ESP, we cannot make anything.
458 * Defragmentation also would be mistake, RA packets
459 * cannot be fragmented, because there is no warranty
460 * that different fragments will go along one path. --ANK
461 */
462 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
463 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
464 return 0;
465 }
466
467 /*
468 * check and decrement ttl
469 */
470 if (hdr->hop_limit <= 1) {
471 /* Force OUTPUT device used as source address */
472 skb->dev = dst->dev;
473 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
474 __IP6_INC_STATS(net, ip6_dst_idev(dst),
475 IPSTATS_MIB_INHDRERRORS);
476
477 kfree_skb(skb);
478 return -ETIMEDOUT;
479 }
480
481 /* XXX: idev->cnf.proxy_ndp? */
482 if (net->ipv6.devconf_all->proxy_ndp &&
483 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
484 int proxied = ip6_forward_proxy_check(skb);
485 if (proxied > 0)
486 return ip6_input(skb);
487 else if (proxied < 0) {
488 __IP6_INC_STATS(net, ip6_dst_idev(dst),
489 IPSTATS_MIB_INDISCARDS);
490 goto drop;
491 }
492 }
493
494 if (!xfrm6_route_forward(skb)) {
495 __IP6_INC_STATS(net, ip6_dst_idev(dst),
496 IPSTATS_MIB_INDISCARDS);
497 goto drop;
498 }
499 dst = skb_dst(skb);
500
501 /* IPv6 specs say nothing about it, but it is clear that we cannot
502 send redirects to source routed frames.
503 We don't send redirects to frames decapsulated from IPsec.
504 */
505 if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
506 struct in6_addr *target = NULL;
507 struct inet_peer *peer;
508 struct rt6_info *rt;
509
510 /*
511 * incoming and outgoing devices are the same
512 * send a redirect.
513 */
514
515 rt = (struct rt6_info *) dst;
516 if (rt->rt6i_flags & RTF_GATEWAY)
517 target = &rt->rt6i_gateway;
518 else
519 target = &hdr->daddr;
520
521 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
522
523 /* Limit redirects both by destination (here)
524 and by source (inside ndisc_send_redirect)
525 */
526 if (inet_peer_xrlim_allow(peer, 1*HZ))
527 ndisc_send_redirect(skb, target);
528 if (peer)
529 inet_putpeer(peer);
530 } else {
531 int addrtype = ipv6_addr_type(&hdr->saddr);
532
533 /* This check is security critical. */
534 if (addrtype == IPV6_ADDR_ANY ||
535 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
536 goto error;
537 if (addrtype & IPV6_ADDR_LINKLOCAL) {
538 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
539 ICMPV6_NOT_NEIGHBOUR, 0);
540 goto error;
541 }
542 }
543
544 mtu = ip6_dst_mtu_forward(dst);
545 if (mtu < IPV6_MIN_MTU)
546 mtu = IPV6_MIN_MTU;
547
548 if (ip6_pkt_too_big(skb, mtu)) {
549 /* Again, force OUTPUT device used as source address */
550 skb->dev = dst->dev;
551 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
552 __IP6_INC_STATS(net, ip6_dst_idev(dst),
553 IPSTATS_MIB_INTOOBIGERRORS);
554 __IP6_INC_STATS(net, ip6_dst_idev(dst),
555 IPSTATS_MIB_FRAGFAILS);
556 kfree_skb(skb);
557 return -EMSGSIZE;
558 }
559
560 if (skb_cow(skb, dst->dev->hard_header_len)) {
561 __IP6_INC_STATS(net, ip6_dst_idev(dst),
562 IPSTATS_MIB_OUTDISCARDS);
563 goto drop;
564 }
565
566 hdr = ipv6_hdr(skb);
567
568 /* Mangling hops number delayed to point after skb COW */
569
570 hdr->hop_limit--;
571
572 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
573 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
574 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
575 net, NULL, skb, skb->dev, dst->dev,
576 ip6_forward_finish);
577
578error:
579 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
580drop:
581 kfree_skb(skb);
582 return -EINVAL;
583}
584
585static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
586{
587 to->pkt_type = from->pkt_type;
588 to->priority = from->priority;
589 to->protocol = from->protocol;
590 skb_dst_drop(to);
591 skb_dst_set(to, dst_clone(skb_dst(from)));
592 to->dev = from->dev;
593 to->mark = from->mark;
594
595#ifdef CONFIG_NET_SCHED
596 to->tc_index = from->tc_index;
597#endif
598 nf_copy(to, from);
599 skb_copy_secmark(to, from);
600}
601
602int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
603 int (*output)(struct net *, struct sock *, struct sk_buff *))
604{
605 struct sk_buff *frag;
606 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
607 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
608 inet6_sk(skb->sk) : NULL;
609 struct ipv6hdr *tmp_hdr;
610 struct frag_hdr *fh;
611 unsigned int mtu, hlen, left, len;
612 int hroom, troom;
613 __be32 frag_id;
614 int ptr, offset = 0, err = 0;
615 u8 *prevhdr, nexthdr = 0;
616
617 err = ip6_find_1stfragopt(skb, &prevhdr);
618 if (err < 0)
619 goto fail;
620 hlen = err;
621 nexthdr = *prevhdr;
622
623 mtu = ip6_skb_dst_mtu(skb);
624
625 /* We must not fragment if the socket is set to force MTU discovery
626 * or if the skb it not generated by a local socket.
627 */
628 if (unlikely(!skb->ignore_df && skb->len > mtu))
629 goto fail_toobig;
630
631 if (IP6CB(skb)->frag_max_size) {
632 if (IP6CB(skb)->frag_max_size > mtu)
633 goto fail_toobig;
634
635 /* don't send fragments larger than what we received */
636 mtu = IP6CB(skb)->frag_max_size;
637 if (mtu < IPV6_MIN_MTU)
638 mtu = IPV6_MIN_MTU;
639 }
640
641 if (np && np->frag_size < mtu) {
642 if (np->frag_size)
643 mtu = np->frag_size;
644 }
645 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
646 goto fail_toobig;
647 mtu -= hlen + sizeof(struct frag_hdr);
648
649 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
650 &ipv6_hdr(skb)->saddr);
651
652 if (skb->ip_summed == CHECKSUM_PARTIAL &&
653 (err = skb_checksum_help(skb)))
654 goto fail;
655
656 hroom = LL_RESERVED_SPACE(rt->dst.dev);
657 if (skb_has_frag_list(skb)) {
658 unsigned int first_len = skb_pagelen(skb);
659 struct sk_buff *frag2;
660
661 if (first_len - hlen > mtu ||
662 ((first_len - hlen) & 7) ||
663 skb_cloned(skb) ||
664 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
665 goto slow_path;
666
667 skb_walk_frags(skb, frag) {
668 /* Correct geometry. */
669 if (frag->len > mtu ||
670 ((frag->len & 7) && frag->next) ||
671 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
672 goto slow_path_clean;
673
674 /* Partially cloned skb? */
675 if (skb_shared(frag))
676 goto slow_path_clean;
677
678 BUG_ON(frag->sk);
679 if (skb->sk) {
680 frag->sk = skb->sk;
681 frag->destructor = sock_wfree;
682 }
683 skb->truesize -= frag->truesize;
684 }
685
686 err = 0;
687 offset = 0;
688 /* BUILD HEADER */
689
690 *prevhdr = NEXTHDR_FRAGMENT;
691 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
692 if (!tmp_hdr) {
693 err = -ENOMEM;
694 goto fail;
695 }
696 frag = skb_shinfo(skb)->frag_list;
697 skb_frag_list_init(skb);
698
699 __skb_pull(skb, hlen);
700 fh = __skb_push(skb, sizeof(struct frag_hdr));
701 __skb_push(skb, hlen);
702 skb_reset_network_header(skb);
703 memcpy(skb_network_header(skb), tmp_hdr, hlen);
704
705 fh->nexthdr = nexthdr;
706 fh->reserved = 0;
707 fh->frag_off = htons(IP6_MF);
708 fh->identification = frag_id;
709
710 first_len = skb_pagelen(skb);
711 skb->data_len = first_len - skb_headlen(skb);
712 skb->len = first_len;
713 ipv6_hdr(skb)->payload_len = htons(first_len -
714 sizeof(struct ipv6hdr));
715
716 for (;;) {
717 /* Prepare header of the next frame,
718 * before previous one went down. */
719 if (frag) {
720 frag->ip_summed = CHECKSUM_NONE;
721 skb_reset_transport_header(frag);
722 fh = __skb_push(frag, sizeof(struct frag_hdr));
723 __skb_push(frag, hlen);
724 skb_reset_network_header(frag);
725 memcpy(skb_network_header(frag), tmp_hdr,
726 hlen);
727 offset += skb->len - hlen - sizeof(struct frag_hdr);
728 fh->nexthdr = nexthdr;
729 fh->reserved = 0;
730 fh->frag_off = htons(offset);
731 if (frag->next)
732 fh->frag_off |= htons(IP6_MF);
733 fh->identification = frag_id;
734 ipv6_hdr(frag)->payload_len =
735 htons(frag->len -
736 sizeof(struct ipv6hdr));
737 ip6_copy_metadata(frag, skb);
738 }
739
740 err = output(net, sk, skb);
741 if (!err)
742 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
743 IPSTATS_MIB_FRAGCREATES);
744
745 if (err || !frag)
746 break;
747
748 skb = frag;
749 frag = skb->next;
750 skb->next = NULL;
751 }
752
753 kfree(tmp_hdr);
754
755 if (err == 0) {
756 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
757 IPSTATS_MIB_FRAGOKS);
758 return 0;
759 }
760
761 kfree_skb_list(frag);
762
763 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
764 IPSTATS_MIB_FRAGFAILS);
765 return err;
766
767slow_path_clean:
768 skb_walk_frags(skb, frag2) {
769 if (frag2 == frag)
770 break;
771 frag2->sk = NULL;
772 frag2->destructor = NULL;
773 skb->truesize += frag2->truesize;
774 }
775 }
776
777slow_path:
778 left = skb->len - hlen; /* Space per frame */
779 ptr = hlen; /* Where to start from */
780
781 /*
782 * Fragment the datagram.
783 */
784
785 troom = rt->dst.dev->needed_tailroom;
786
787 /*
788 * Keep copying data until we run out.
789 */
790 while (left > 0) {
791 u8 *fragnexthdr_offset;
792
793 len = left;
794 /* IF: it doesn't fit, use 'mtu' - the data space left */
795 if (len > mtu)
796 len = mtu;
797 /* IF: we are not sending up to and including the packet end
798 then align the next start on an eight byte boundary */
799 if (len < left) {
800 len &= ~7;
801 }
802
803 /* Allocate buffer */
804 frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
805 hroom + troom, GFP_ATOMIC);
806 if (!frag) {
807 err = -ENOMEM;
808 goto fail;
809 }
810
811 /*
812 * Set up data on packet
813 */
814
815 ip6_copy_metadata(frag, skb);
816 skb_reserve(frag, hroom);
817 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
818 skb_reset_network_header(frag);
819 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
820 frag->transport_header = (frag->network_header + hlen +
821 sizeof(struct frag_hdr));
822
823 /*
824 * Charge the memory for the fragment to any owner
825 * it might possess
826 */
827 if (skb->sk)
828 skb_set_owner_w(frag, skb->sk);
829
830 /*
831 * Copy the packet header into the new buffer.
832 */
833 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
834
835 fragnexthdr_offset = skb_network_header(frag);
836 fragnexthdr_offset += prevhdr - skb_network_header(skb);
837 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
838
839 /*
840 * Build fragment header.
841 */
842 fh->nexthdr = nexthdr;
843 fh->reserved = 0;
844 fh->identification = frag_id;
845
846 /*
847 * Copy a block of the IP datagram.
848 */
849 BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
850 len));
851 left -= len;
852
853 fh->frag_off = htons(offset);
854 if (left > 0)
855 fh->frag_off |= htons(IP6_MF);
856 ipv6_hdr(frag)->payload_len = htons(frag->len -
857 sizeof(struct ipv6hdr));
858
859 ptr += len;
860 offset += len;
861
862 /*
863 * Put this fragment into the sending queue.
864 */
865 err = output(net, sk, frag);
866 if (err)
867 goto fail;
868
869 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
870 IPSTATS_MIB_FRAGCREATES);
871 }
872 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
873 IPSTATS_MIB_FRAGOKS);
874 consume_skb(skb);
875 return err;
876
877fail_toobig:
878 if (skb->sk && dst_allfrag(skb_dst(skb)))
879 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
880
881 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
882 err = -EMSGSIZE;
883
884fail:
885 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
886 IPSTATS_MIB_FRAGFAILS);
887 kfree_skb(skb);
888 return err;
889}
890
891static inline int ip6_rt_check(const struct rt6key *rt_key,
892 const struct in6_addr *fl_addr,
893 const struct in6_addr *addr_cache)
894{
895 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
896 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
897}
898
899static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
900 struct dst_entry *dst,
901 const struct flowi6 *fl6)
902{
903 struct ipv6_pinfo *np = inet6_sk(sk);
904 struct rt6_info *rt;
905
906 if (!dst)
907 goto out;
908
909 if (dst->ops->family != AF_INET6) {
910 dst_release(dst);
911 return NULL;
912 }
913
914 rt = (struct rt6_info *)dst;
915 /* Yes, checking route validity in not connected
916 * case is not very simple. Take into account,
917 * that we do not support routing by source, TOS,
918 * and MSG_DONTROUTE --ANK (980726)
919 *
920 * 1. ip6_rt_check(): If route was host route,
921 * check that cached destination is current.
922 * If it is network route, we still may
923 * check its validity using saved pointer
924 * to the last used address: daddr_cache.
925 * We do not want to save whole address now,
926 * (because main consumer of this service
927 * is tcp, which has not this problem),
928 * so that the last trick works only on connected
929 * sockets.
930 * 2. oif also should be the same.
931 */
932 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
933#ifdef CONFIG_IPV6_SUBTREES
934 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
935#endif
936 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
937 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
938 dst_release(dst);
939 dst = NULL;
940 }
941
942out:
943 return dst;
944}
945
946static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
947 struct dst_entry **dst, struct flowi6 *fl6)
948{
949#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
950 struct neighbour *n;
951 struct rt6_info *rt;
952#endif
953 int err;
954 int flags = 0;
955
956 /* The correct way to handle this would be to do
957 * ip6_route_get_saddr, and then ip6_route_output; however,
958 * the route-specific preferred source forces the
959 * ip6_route_output call _before_ ip6_route_get_saddr.
960 *
961 * In source specific routing (no src=any default route),
962 * ip6_route_output will fail given src=any saddr, though, so
963 * that's why we try it again later.
964 */
965 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
966 struct rt6_info *rt;
967 bool had_dst = *dst != NULL;
968
969 if (!had_dst)
970 *dst = ip6_route_output(net, sk, fl6);
971 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
972 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
973 sk ? inet6_sk(sk)->srcprefs : 0,
974 &fl6->saddr);
975 if (err)
976 goto out_err_release;
977
978 /* If we had an erroneous initial result, pretend it
979 * never existed and let the SA-enabled version take
980 * over.
981 */
982 if (!had_dst && (*dst)->error) {
983 dst_release(*dst);
984 *dst = NULL;
985 }
986
987 if (fl6->flowi6_oif)
988 flags |= RT6_LOOKUP_F_IFACE;
989 }
990
991 if (!*dst)
992 *dst = ip6_route_output_flags(net, sk, fl6, flags);
993
994 err = (*dst)->error;
995 if (err)
996 goto out_err_release;
997
998#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
999 /*
1000 * Here if the dst entry we've looked up
1001 * has a neighbour entry that is in the INCOMPLETE
1002 * state and the src address from the flow is
1003 * marked as OPTIMISTIC, we release the found
1004 * dst entry and replace it instead with the
1005 * dst entry of the nexthop router
1006 */
1007 rt = (struct rt6_info *) *dst;
1008 rcu_read_lock_bh();
1009 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1010 rt6_nexthop(rt, &fl6->daddr));
1011 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
1012 rcu_read_unlock_bh();
1013
1014 if (err) {
1015 struct inet6_ifaddr *ifp;
1016 struct flowi6 fl_gw6;
1017 int redirect;
1018
1019 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1020 (*dst)->dev, 1);
1021
1022 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1023 if (ifp)
1024 in6_ifa_put(ifp);
1025
1026 if (redirect) {
1027 /*
1028 * We need to get the dst entry for the
1029 * default router instead
1030 */
1031 dst_release(*dst);
1032 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1033 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1034 *dst = ip6_route_output(net, sk, &fl_gw6);
1035 err = (*dst)->error;
1036 if (err)
1037 goto out_err_release;
1038 }
1039 }
1040#endif
1041 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1042 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1043 err = -EAFNOSUPPORT;
1044 goto out_err_release;
1045 }
1046
1047 return 0;
1048
1049out_err_release:
1050 dst_release(*dst);
1051 *dst = NULL;
1052
1053 if (err == -ENETUNREACH)
1054 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1055 return err;
1056}
1057
1058/**
1059 * ip6_dst_lookup - perform route lookup on flow
1060 * @sk: socket which provides route info
1061 * @dst: pointer to dst_entry * for result
1062 * @fl6: flow to lookup
1063 *
1064 * This function performs a route lookup on the given flow.
1065 *
1066 * It returns zero on success, or a standard errno code on error.
1067 */
1068int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1069 struct flowi6 *fl6)
1070{
1071 *dst = NULL;
1072 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1073}
1074EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1075
1076/**
1077 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1078 * @sk: socket which provides route info
1079 * @fl6: flow to lookup
1080 * @final_dst: final destination address for ipsec lookup
1081 *
1082 * This function performs a route lookup on the given flow.
1083 *
1084 * It returns a valid dst pointer on success, or a pointer encoded
1085 * error code.
1086 */
1087struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1088 const struct in6_addr *final_dst)
1089{
1090 struct dst_entry *dst = NULL;
1091 int err;
1092
1093 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1094 if (err)
1095 return ERR_PTR(err);
1096 if (final_dst)
1097 fl6->daddr = *final_dst;
1098
1099 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1100}
1101EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1102
1103/**
1104 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1105 * @sk: socket which provides the dst cache and route info
1106 * @fl6: flow to lookup
1107 * @final_dst: final destination address for ipsec lookup
1108 *
1109 * This function performs a route lookup on the given flow with the
1110 * possibility of using the cached route in the socket if it is valid.
1111 * It will take the socket dst lock when operating on the dst cache.
1112 * As a result, this function can only be used in process context.
1113 *
1114 * It returns a valid dst pointer on success, or a pointer encoded
1115 * error code.
1116 */
1117struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1118 const struct in6_addr *final_dst)
1119{
1120 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1121
1122 dst = ip6_sk_dst_check(sk, dst, fl6);
1123 if (!dst)
1124 dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1125
1126 return dst;
1127}
1128EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1129
1130static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1131 gfp_t gfp)
1132{
1133 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1134}
1135
1136static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1137 gfp_t gfp)
1138{
1139 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1140}
1141
1142static void ip6_append_data_mtu(unsigned int *mtu,
1143 int *maxfraglen,
1144 unsigned int fragheaderlen,
1145 struct sk_buff *skb,
1146 struct rt6_info *rt,
1147 unsigned int orig_mtu)
1148{
1149 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1150 if (!skb) {
1151 /* first fragment, reserve header_len */
1152 *mtu = orig_mtu - rt->dst.header_len;
1153
1154 } else {
1155 /*
1156 * this fragment is not first, the headers
1157 * space is regarded as data space.
1158 */
1159 *mtu = orig_mtu;
1160 }
1161 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1162 + fragheaderlen - sizeof(struct frag_hdr);
1163 }
1164}
1165
1166static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1167 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1168 struct rt6_info *rt, struct flowi6 *fl6)
1169{
1170 struct ipv6_pinfo *np = inet6_sk(sk);
1171 unsigned int mtu;
1172 struct ipv6_txoptions *opt = ipc6->opt;
1173
1174 /*
1175 * setup for corking
1176 */
1177 if (opt) {
1178 if (WARN_ON(v6_cork->opt))
1179 return -EINVAL;
1180
1181 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1182 if (unlikely(!v6_cork->opt))
1183 return -ENOBUFS;
1184
1185 v6_cork->opt->tot_len = sizeof(*opt);
1186 v6_cork->opt->opt_flen = opt->opt_flen;
1187 v6_cork->opt->opt_nflen = opt->opt_nflen;
1188
1189 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1190 sk->sk_allocation);
1191 if (opt->dst0opt && !v6_cork->opt->dst0opt)
1192 return -ENOBUFS;
1193
1194 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1195 sk->sk_allocation);
1196 if (opt->dst1opt && !v6_cork->opt->dst1opt)
1197 return -ENOBUFS;
1198
1199 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1200 sk->sk_allocation);
1201 if (opt->hopopt && !v6_cork->opt->hopopt)
1202 return -ENOBUFS;
1203
1204 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1205 sk->sk_allocation);
1206 if (opt->srcrt && !v6_cork->opt->srcrt)
1207 return -ENOBUFS;
1208
1209 /* need source address above miyazawa*/
1210 }
1211 dst_hold(&rt->dst);
1212 cork->base.dst = &rt->dst;
1213 cork->fl.u.ip6 = *fl6;
1214 v6_cork->hop_limit = ipc6->hlimit;
1215 v6_cork->tclass = ipc6->tclass;
1216 if (rt->dst.flags & DST_XFRM_TUNNEL)
1217 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1218 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1219 else
1220 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1222 if (np->frag_size < mtu) {
1223 if (np->frag_size)
1224 mtu = np->frag_size;
1225 }
1226 if (mtu < IPV6_MIN_MTU)
1227 return -EINVAL;
1228 cork->base.fragsize = mtu;
1229 if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1230 cork->base.flags |= IPCORK_ALLFRAG;
1231 cork->base.length = 0;
1232
1233 return 0;
1234}
1235
/*
 * Append @length bytes, fetched through @getfrag, to the skbs pending on
 * @queue.  Data goes into the tail skb while it has room; otherwise a new
 * skb is allocated and queued.
 *
 * @sk:          sending socket (protocol, type, allocation mode, accounting)
 * @fl6:         flow passed on to the local error / rxpmtu reports
 * @queue:       pending skbs; its tail is the skb currently being filled
 * @cork:        provides the dst, fragsize (used as MTU), flags and the
 *               running length, which is increased by @length here
 * @v6_cork:     provides the tx options (may be NULL inside)
 * @pfrag:       page fragment used for data when the device has NETIF_F_SG
 * @getfrag:     callback copying payload; a negative return means -EFAULT
 * @from:        opaque cookie handed to @getfrag
 * @length:      number of bytes to append
 * @transhdrlen: when non-zero the skb is allocated with
 *               sock_alloc_send_skb(); it is also excluded from the amount
 *               requested from @getfrag and cleared after the first new skb
 * @flags:       MSG_MORE, MSG_DONTWAIT and MSG_CONFIRM are examined
 * @ipc6:        only ->dontfrag is read
 * @sockc:       only ->tsflags is read
 *
 * Returns 0 on success or a negative errno.  On the "error" paths the bytes
 * not yet appended are subtracted from cork->length again and
 * IPSTATS_MIB_OUTDISCARDS is incremented; the early -EMSGSIZE return happens
 * before cork->length is modified.
 */
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/*
		 * Empty queue: the first skb additionally has to account for
		 * the fragmentable extension headers and for the dst header
		 * space beyond rt6i_nfheader_len.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Headers repeated in every fragment: IPv6 header + non-fragmentable part. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Payload past fragheaderlen is rounded down to a multiple of 8. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/*
	 * dontfrag set on a UDP or RAW socket and the data would not fit into
	 * one packet: report the usable size via ipv6_local_rxpmtu() and fail
	 * with EMSGSIZE.
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		ipv6_local_error(sk, EMSGSIZE, fl6,
				 mtu - headersize +
				 sizeof(struct ipv6hdr));
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		/* A timestamp key is consumed only for SW timestamps with OPT_ID. */
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	/* Nothing queued yet: jump straight into the allocation path below. */
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			/*
			 * fraggap: bytes of the current skb lying beyond
			 * maxfraglen; they are moved into the new skb below.
			 */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			/*
			 * More data is announced but the device cannot do
			 * scatter-gather: allocate a full MTU so later calls
			 * can append linearly.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			/* Bytes to fetch from the caller for this skb. */
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/*
				 * Allocation is only attempted while
				 * sk_wmem_alloc is at most twice sk_sndbuf.
				 */
				if (refcount_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/*
				 * Move the overhanging bytes from the previous
				 * skb into this one, fix up both checksums and
				 * trim the previous skb back to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				/* The new skb is not queued yet, so free it here. */
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* These apply to the first new skb only. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			/* No scatter-gather: copy into the skb's linear area. */
			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				/* Undo the skb_put() above. */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			/* Scatter-gather: place the data in a page fragment. */
			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				/* A new frag slot is needed; fail if none is left. */
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			/* Account the copied bytes in the frag, the skb and the socket. */
			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			refcount_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* Take back the part of @length that was never appended. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1538
1539int ip6_append_data(struct sock *sk,
1540 int getfrag(void *from, char *to, int offset, int len,
1541 int odd, struct sk_buff *skb),
1542 void *from, int length, int transhdrlen,
1543 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1544 struct rt6_info *rt, unsigned int flags,
1545 const struct sockcm_cookie *sockc)
1546{
1547 struct inet_sock *inet = inet_sk(sk);
1548 struct ipv6_pinfo *np = inet6_sk(sk);
1549 int exthdrlen;
1550 int err;
1551
1552 if (flags&MSG_PROBE)
1553 return 0;
1554 if (skb_queue_empty(&sk->sk_write_queue)) {
1555 /*
1556 * setup for corking
1557 */
1558 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1559 ipc6, rt, fl6);
1560 if (err)
1561 return err;
1562
1563 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1564 length += exthdrlen;
1565 transhdrlen += exthdrlen;
1566 } else {
1567 fl6 = &inet->cork.fl.u.ip6;
1568 transhdrlen = 0;
1569 }
1570
1571 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1572 &np->cork, sk_page_frag(sk), getfrag,
1573 from, length, transhdrlen, flags, ipc6, sockc);
1574}
1575EXPORT_SYMBOL_GPL(ip6_append_data);
1576
1577static void ip6_cork_release(struct inet_cork_full *cork,
1578 struct inet6_cork *v6_cork)
1579{
1580 if (v6_cork->opt) {
1581 kfree(v6_cork->opt->dst0opt);
1582 kfree(v6_cork->opt->dst1opt);
1583 kfree(v6_cork->opt->hopopt);
1584 kfree(v6_cork->opt->srcrt);
1585 kfree(v6_cork->opt);
1586 v6_cork->opt = NULL;
1587 }
1588
1589 if (cork->base.dst) {
1590 dst_release(cork->base.dst);
1591 cork->base.dst = NULL;
1592 cork->base.flags &= ~IPCORK_ALLFRAG;
1593 }
1594 memset(&cork->fl, 0, sizeof(cork->fl));
1595}
1596
1597struct sk_buff *__ip6_make_skb(struct sock *sk,
1598 struct sk_buff_head *queue,
1599 struct inet_cork_full *cork,
1600 struct inet6_cork *v6_cork)
1601{
1602 struct sk_buff *skb, *tmp_skb;
1603 struct sk_buff **tail_skb;
1604 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1605 struct ipv6_pinfo *np = inet6_sk(sk);
1606 struct net *net = sock_net(sk);
1607 struct ipv6hdr *hdr;
1608 struct ipv6_txoptions *opt = v6_cork->opt;
1609 struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1610 struct flowi6 *fl6 = &cork->fl.u.ip6;
1611 unsigned char proto = fl6->flowi6_proto;
1612
1613 skb = __skb_dequeue(queue);
1614 if (!skb)
1615 goto out;
1616 tail_skb = &(skb_shinfo(skb)->frag_list);
1617
1618 /* move skb->data to ip header from ext header */
1619 if (skb->data < skb_network_header(skb))
1620 __skb_pull(skb, skb_network_offset(skb));
1621 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1622 __skb_pull(tmp_skb, skb_network_header_len(skb));
1623 *tail_skb = tmp_skb;
1624 tail_skb = &(tmp_skb->next);
1625 skb->len += tmp_skb->len;
1626 skb->data_len += tmp_skb->len;
1627 skb->truesize += tmp_skb->truesize;
1628 tmp_skb->destructor = NULL;
1629 tmp_skb->sk = NULL;
1630 }
1631
1632 /* Allow local fragmentation. */
1633 skb->ignore_df = ip6_sk_ignore_df(sk);
1634
1635 *final_dst = fl6->daddr;
1636 __skb_pull(skb, skb_network_header_len(skb));
1637 if (opt && opt->opt_flen)
1638 ipv6_push_frag_opts(skb, opt, &proto);
1639 if (opt && opt->opt_nflen)
1640 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1641
1642 skb_push(skb, sizeof(struct ipv6hdr));
1643 skb_reset_network_header(skb);
1644 hdr = ipv6_hdr(skb);
1645
1646 ip6_flow_hdr(hdr, v6_cork->tclass,
1647 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1648 ip6_autoflowlabel(net, np), fl6));
1649 hdr->hop_limit = v6_cork->hop_limit;
1650 hdr->nexthdr = proto;
1651 hdr->saddr = fl6->saddr;
1652 hdr->daddr = *final_dst;
1653
1654 skb->priority = sk->sk_priority;
1655 skb->mark = sk->sk_mark;
1656
1657 skb_dst_set(skb, dst_clone(&rt->dst));
1658 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1659 if (proto == IPPROTO_ICMPV6) {
1660 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1661
1662 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1663 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1664 }
1665
1666 ip6_cork_release(cork, v6_cork);
1667out:
1668 return skb;
1669}
1670
1671int ip6_send_skb(struct sk_buff *skb)
1672{
1673 struct net *net = sock_net(skb->sk);
1674 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1675 int err;
1676
1677 err = ip6_local_out(net, skb->sk, skb);
1678 if (err) {
1679 if (err > 0)
1680 err = net_xmit_errno(err);
1681 if (err)
1682 IP6_INC_STATS(net, rt->rt6i_idev,
1683 IPSTATS_MIB_OUTDISCARDS);
1684 }
1685
1686 return err;
1687}
1688
/*
 * Assemble the socket's pending data with ip6_finish_skb() and send it.
 * Returns 0 when there was nothing to send, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1700
1701static void __ip6_flush_pending_frames(struct sock *sk,
1702 struct sk_buff_head *queue,
1703 struct inet_cork_full *cork,
1704 struct inet6_cork *v6_cork)
1705{
1706 struct sk_buff *skb;
1707
1708 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1709 if (skb_dst(skb))
1710 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1711 IPSTATS_MIB_OUTDISCARDS);
1712 kfree_skb(skb);
1713 }
1714
1715 ip6_cork_release(cork, v6_cork);
1716}
1717
1718void ip6_flush_pending_frames(struct sock *sk)
1719{
1720 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1721 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1722}
1723EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1724
1725struct sk_buff *ip6_make_skb(struct sock *sk,
1726 int getfrag(void *from, char *to, int offset,
1727 int len, int odd, struct sk_buff *skb),
1728 void *from, int length, int transhdrlen,
1729 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1730 struct rt6_info *rt, unsigned int flags,
1731 const struct sockcm_cookie *sockc)
1732{
1733 struct inet_cork_full cork;
1734 struct inet6_cork v6_cork;
1735 struct sk_buff_head queue;
1736 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1737 int err;
1738
1739 if (flags & MSG_PROBE)
1740 return NULL;
1741
1742 __skb_queue_head_init(&queue);
1743
1744 cork.base.flags = 0;
1745 cork.base.addr = 0;
1746 cork.base.opt = NULL;
1747 cork.base.dst = NULL;
1748 v6_cork.opt = NULL;
1749 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1750 if (err) {
1751 ip6_cork_release(&cork, &v6_cork);
1752 return ERR_PTR(err);
1753 }
1754 if (ipc6->dontfrag < 0)
1755 ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1756
1757 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1758 ¤t->task_frag, getfrag, from,
1759 length + exthdrlen, transhdrlen + exthdrlen,
1760 flags, ipc6, sockc);
1761 if (err) {
1762 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1763 return ERR_PTR(err);
1764 }
1765
1766 return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1767}