jcs's openbsd hax
openbsd
at jcs 1914 lines 47 kB view raw
1/* $OpenBSD: ip_output.c,v 1.416 2025/12/13 00:55:02 jsg Exp $ */ 2/* $NetBSD: ip_output.c,v 1.28 1996/02/13 23:43:07 christos Exp $ */ 3 4/* 5 * Copyright (c) 1982, 1986, 1988, 1990, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 31 * 32 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94 33 */ 34 35#include "pf.h" 36 37#include <sys/param.h> 38#include <sys/systm.h> 39#include <sys/mbuf.h> 40#include <sys/protosw.h> 41#include <sys/socket.h> 42#include <sys/proc.h> 43 44#include <net/if.h> 45#include <net/if_var.h> 46#include <net/if_enc.h> 47#include <net/route.h> 48 49#include <netinet/in.h> 50#include <netinet/ip.h> 51#include <netinet/in_pcb.h> 52#include <netinet/in_var.h> 53#include <netinet/ip_var.h> 54#include <netinet/ip_icmp.h> 55#include <netinet/tcp.h> 56#include <netinet/udp.h> 57#include <netinet/tcp_timer.h> 58#include <netinet/tcp_var.h> 59#include <netinet/udp_var.h> 60 61#if NPF > 0 62#include <net/pfvar.h> 63#endif 64 65#ifdef IPSEC 66#ifdef ENCDEBUG 67#define DPRINTF(fmt, args...) \ 68 do { \ 69 if (atomic_load_int(&encdebug) \ 70 printf("%s: " fmt "\n", __func__, ## args); \ 71 } while (0) 72#else 73#define DPRINTF(fmt, args...) \ 74 do { } while (0) 75#endif 76#endif /* IPSEC */ 77 78int ip_pcbopts(struct mbuf **, struct mbuf *); 79int ip_multicast_if(struct ip_mreqn *, u_int, unsigned int *); 80int ip_setmoptions(int, struct ip_moptions **, struct mbuf *, u_int); 81void ip_mloopback(struct ifnet *, struct mbuf *, struct sockaddr_in *); 82static u_int16_t in_cksum_phdr(u_int32_t, u_int32_t, u_int32_t); 83void in_delayed_cksum(struct mbuf *); 84 85int ip_output_ipsec_lookup(struct mbuf *m, int hlen, 86 const struct ipsec_level *seclevel, struct tdb **, int ipsecflowinfo); 87void ip_output_ipsec_pmtu_update(struct tdb *, struct route *, struct in_addr, 88 int); 89int ip_output_ipsec_send(struct tdb *, struct mbuf *, struct route *, u_int, 90 int); 91 92/* 93 * IP output. The packet in mbuf chain m contains a skeletal IP 94 * header (with len, off, ttl, proto, tos, src, dst). 95 * The mbuf chain containing the packet will be freed. 96 * The mbuf opt, if present, will not be freed. 97 */ 98int 99ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags, 100 struct ip_moptions *imo, const struct ipsec_level *seclevel, 101 u_int32_t ipsecflowinfo) 102{ 103 struct ip *ip; 104 struct ifnet *ifp = NULL; 105 struct mbuf_list ml; 106 int hlen = sizeof (struct ip); 107 int error = 0; 108 struct route iproute; 109 struct sockaddr_in *dst; 110 struct tdb *tdb = NULL; 111 u_long mtu; 112 u_int orig_rtableid; 113 114 NET_ASSERT_LOCKED(); 115 116#ifdef DIAGNOSTIC 117 if ((m->m_flags & M_PKTHDR) == 0) 118 panic("ip_output no HDR"); 119#endif 120 if (opt) 121 m = ip_insertoptions(m, opt, &hlen); 122 123 ip = mtod(m, struct ip *); 124 125 /* 126 * Fill in IP header. 127 */ 128 if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) { 129 ip->ip_v = IPVERSION; 130 ip->ip_off &= htons(IP_DF); 131 ip->ip_id = htons(ip_randomid()); 132 ip->ip_hl = hlen >> 2; 133 ipstat_inc(ips_localout); 134 } else { 135 hlen = ip->ip_hl << 2; 136 } 137 138 /* 139 * We should not send traffic to 0/8 say both Stevens and RFCs 140 * 5735 section 3 and 1122 sections 3.2.1.3 and 3.3.6. 141 */ 142 if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == 0) { 143 error = ENETUNREACH; 144 goto bad; 145 } 146 147 orig_rtableid = m->m_pkthdr.ph_rtableid; 148#if NPF > 0 149reroute: 150#endif 151 152 /* 153 * Do a route lookup now in case we need the source address to 154 * do an SPD lookup in IPsec; for most packets, the source address 155 * is set at a higher level protocol. ICMPs and other packets 156 * though (e.g., traceroute) have a source address of zeroes. 157 */ 158 if (ro == NULL) { 159 ro = &iproute; 160 ro->ro_rt = NULL; 161 } 162 163 /* 164 * If there is a cached route, check that it is to the same 165 * destination and is still up. If not, free it and try again. 166 */ 167 route_cache(ro, &ip->ip_dst, &ip->ip_src, m->m_pkthdr.ph_rtableid); 168 dst = &ro->ro_dstsin; 169 170 if ((IN_MULTICAST(ip->ip_dst.s_addr) || 171 (ip->ip_dst.s_addr == INADDR_BROADCAST)) && 172 imo != NULL && (ifp = if_get(imo->imo_ifidx)) != NULL) { 173 174 mtu = ifp->if_mtu; 175 if (ip->ip_src.s_addr == INADDR_ANY) { 176 struct in_ifaddr *ia; 177 178 ia = in_ifp2ia(ifp); 179 if (ia != NULL) 180 ip->ip_src = ia->ia_addr.sin_addr; 181 } 182 } else { 183 struct in_ifaddr *ia; 184 185 if (ro->ro_rt == NULL) 186 ro->ro_rt = rtalloc_mpath(&ro->ro_dstsa, 187 &ip->ip_src.s_addr, ro->ro_tableid); 188 189 if (ro->ro_rt == NULL) { 190 ipstat_inc(ips_noroute); 191 error = EHOSTUNREACH; 192 goto bad; 193 } 194 195 ia = ifatoia(ro->ro_rt->rt_ifa); 196 if (ISSET(ro->ro_rt->rt_flags, RTF_LOCAL)) 197 ifp = if_get(rtable_loindex(m->m_pkthdr.ph_rtableid)); 198 else 199 ifp = if_get(ro->ro_rt->rt_ifidx); 200 /* 201 * We aren't using rtisvalid() here because the UP/DOWN state 202 * machine is broken with some Ethernet drivers like em(4). 203 * As a result we might try to use an invalid cached route 204 * entry while an interface is being detached. 205 */ 206 if (ifp == NULL) { 207 ipstat_inc(ips_noroute); 208 error = EHOSTUNREACH; 209 goto bad; 210 } 211 mtu = atomic_load_int(&ro->ro_rt->rt_mtu); 212 if (mtu == 0) 213 mtu = ifp->if_mtu; 214 215 if (ro->ro_rt->rt_flags & RTF_GATEWAY) 216 dst = satosin(ro->ro_rt->rt_gateway); 217 218 /* Set the source IP address */ 219 if (ip->ip_src.s_addr == INADDR_ANY && ia) 220 ip->ip_src = ia->ia_addr.sin_addr; 221 } 222 223#ifdef IPSEC 224 if (ipsec_in_use || seclevel != NULL) { 225 /* Do we have any pending SAs to apply ? */ 226 error = ip_output_ipsec_lookup(m, hlen, seclevel, &tdb, 227 ipsecflowinfo); 228 if (error) { 229 /* Should silently drop packet */ 230 if (error == -EINVAL) 231 error = 0; 232 goto bad; 233 } 234 if (tdb != NULL) { 235 /* 236 * If it needs TCP/UDP hardware-checksumming, do the 237 * computation now. 238 */ 239 in_proto_cksum_out(m, NULL); 240 } 241 } 242#endif /* IPSEC */ 243 244 if (IN_MULTICAST(ip->ip_dst.s_addr) || 245 (ip->ip_dst.s_addr == INADDR_BROADCAST)) { 246 247 m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ? 248 M_BCAST : M_MCAST; 249 250 /* 251 * IP destination address is multicast. Make sure "dst" 252 * still points to the address in "ro". (It may have been 253 * changed to point to a gateway address, above.) 254 */ 255 dst = &ro->ro_dstsin; 256 257 /* 258 * See if the caller provided any multicast options 259 */ 260 if (imo != NULL) 261 ip->ip_ttl = imo->imo_ttl; 262 else 263 ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL; 264 265 /* 266 * if we don't know the outgoing ifp yet, we can't generate 267 * output 268 */ 269 if (!ifp) { 270 ipstat_inc(ips_noroute); 271 error = EHOSTUNREACH; 272 goto bad; 273 } 274 275 /* 276 * Confirm that the outgoing interface supports multicast, 277 * but only if the packet actually is going out on that 278 * interface (i.e., no IPsec is applied). 279 */ 280 if ((((m->m_flags & M_MCAST) && 281 (ifp->if_flags & IFF_MULTICAST) == 0) || 282 ((m->m_flags & M_BCAST) && 283 (ifp->if_flags & IFF_BROADCAST) == 0)) && (tdb == NULL)) { 284 ipstat_inc(ips_noroute); 285 error = ENETUNREACH; 286 goto bad; 287 } 288 289 /* 290 * If source address not specified yet, use address 291 * of outgoing interface. 292 */ 293 if (ip->ip_src.s_addr == INADDR_ANY) { 294 struct in_ifaddr *ia; 295 296 ia = in_ifp2ia(ifp); 297 if (ia != NULL) 298 ip->ip_src = ia->ia_addr.sin_addr; 299 } 300 301 if ((imo == NULL || imo->imo_loop) && 302 in_hasmulti(&ip->ip_dst, ifp)) { 303 /* 304 * If we belong to the destination multicast group 305 * on the outgoing interface, and the caller did not 306 * forbid loopback, loop back a copy. 307 * Can't defer TCP/UDP checksumming, do the 308 * computation now. 309 */ 310 in_proto_cksum_out(m, NULL); 311 ip_mloopback(ifp, m, dst); 312 } 313#ifdef MROUTING 314 else { 315 /* 316 * If we are acting as a multicast router, perform 317 * multicast forwarding as if the packet had just 318 * arrived on the interface to which we are about 319 * to send. The multicast forwarding function 320 * recursively calls this function, using the 321 * IP_FORWARDING flag to prevent infinite recursion. 322 * 323 * Multicasts that are looped back by ip_mloopback(), 324 * above, will be forwarded by the ip_input() routine, 325 * if necessary. 326 */ 327 if (atomic_load_int(&ipmforwarding) && 328 ip_mrouter[ifp->if_rdomain] && 329 (flags & IP_FORWARDING) == 0) { 330 int rv; 331 332 KERNEL_LOCK(); 333 rv = ip_mforward(m, ifp, flags); 334 KERNEL_UNLOCK(); 335 if (rv != 0) 336 goto bad; 337 } 338 } 339#endif 340 /* 341 * Multicasts with a time-to-live of zero may be looped- 342 * back, above, but must not be transmitted on a network. 343 * Also, multicasts addressed to the loopback interface 344 * are not sent -- the above call to ip_mloopback() will 345 * loop back a copy if this host actually belongs to the 346 * destination group on the loopback interface. 347 */ 348 if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) 349 goto bad; 350 } 351 352 /* 353 * Look for broadcast address and verify user is allowed to send 354 * such a packet; if the packet is going in an IPsec tunnel, skip 355 * this check. 356 */ 357 if ((tdb == NULL) && ((dst->sin_addr.s_addr == INADDR_BROADCAST) || 358 (ro && ro->ro_rt && ISSET(ro->ro_rt->rt_flags, RTF_BROADCAST)))) { 359 if ((ifp->if_flags & IFF_BROADCAST) == 0) { 360 error = EADDRNOTAVAIL; 361 goto bad; 362 } 363 if ((flags & IP_ALLOWBROADCAST) == 0) { 364 error = EACCES; 365 goto bad; 366 } 367 368 /* Don't allow broadcast messages to be fragmented */ 369 if (ntohs(ip->ip_len) > ifp->if_mtu) { 370 error = EMSGSIZE; 371 goto bad; 372 } 373 m->m_flags |= M_BCAST; 374 } else 375 m->m_flags &= ~M_BCAST; 376 377 /* 378 * If we're doing Path MTU discovery, we need to set DF unless 379 * the route's MTU is locked. 380 */ 381 if ((flags & IP_MTUDISC) && ro && ro->ro_rt && 382 (ro->ro_rt->rt_locks & RTV_MTU) == 0) 383 ip->ip_off |= htons(IP_DF); 384 385#ifdef IPSEC 386 /* 387 * Check if the packet needs encapsulation. 388 */ 389 if (tdb != NULL) { 390 /* Callee frees mbuf */ 391 error = ip_output_ipsec_send(tdb, m, ro, orig_rtableid, 392 (flags & IP_FORWARDING) ? 1 : 0); 393 goto done; 394 } 395#endif /* IPSEC */ 396 397 /* 398 * Packet filter 399 */ 400#if NPF > 0 401 if (pf_test(AF_INET, (flags & IP_FORWARDING) ? PF_FWD : PF_OUT, 402 ifp, &m) != PF_PASS) { 403 error = EACCES; 404 goto bad; 405 } 406 if (m == NULL) 407 goto done; 408 ip = mtod(m, struct ip *); 409 hlen = ip->ip_hl << 2; 410 if ((m->m_pkthdr.pf.flags & (PF_TAG_REROUTE | PF_TAG_GENERATED)) == 411 (PF_TAG_REROUTE | PF_TAG_GENERATED)) 412 /* already rerun the route lookup, go on */ 413 m->m_pkthdr.pf.flags &= ~(PF_TAG_GENERATED | PF_TAG_REROUTE); 414 else if (m->m_pkthdr.pf.flags & PF_TAG_REROUTE) { 415 /* tag as generated to skip over pf_test on rerun */ 416 m->m_pkthdr.pf.flags |= PF_TAG_GENERATED; 417 if (ro == &iproute) 418 rtfree(ro->ro_rt); 419 ro = NULL; 420 if_put(ifp); /* drop reference since target changed */ 421 ifp = NULL; 422 goto reroute; 423 } 424#endif 425 426#ifdef IPSEC 427 if (ISSET(flags, IP_FORWARDING) && ISSET(flags, IP_FORWARDING_IPSEC) && 428 !ISSET(m->m_pkthdr.ph_tagsset, PACKET_TAG_IPSEC_IN_DONE)) { 429 error = EHOSTUNREACH; 430 goto bad; 431 } 432#endif 433 434 /* 435 * If TSO or small enough for interface, can just send directly. 436 */ 437 error = if_output_tso(ifp, &m, sintosa(dst), ro->ro_rt, mtu); 438 if (error || m == NULL) 439 goto done; 440 441 /* 442 * Too large for interface; fragment if possible. 443 * Must be able to put at least 8 bytes per fragment. 444 */ 445 if (ip->ip_off & htons(IP_DF)) { 446#ifdef IPSEC 447 if (atomic_load_int(&ip_mtudisc)) 448 ipsec_adjust_mtu(m, ifp->if_mtu); 449#endif 450 error = EMSGSIZE; 451#if NPF > 0 452 /* pf changed routing table, use orig rtable for path MTU */ 453 if (ro->ro_tableid != orig_rtableid) { 454 rtfree(ro->ro_rt); 455 ro->ro_tableid = orig_rtableid; 456 ro->ro_rt = icmp_mtudisc_clone( 457 ro->ro_dstsin.sin_addr, ro->ro_tableid, 0); 458 } 459#endif 460 /* 461 * This case can happen if the user changed the MTU 462 * of an interface after enabling IP on it. Because 463 * most netifs don't keep track of routes pointing to 464 * them, there is no way for one to update all its 465 * routes when the MTU is changed. 466 */ 467 if (rtisvalid(ro->ro_rt) && 468 ISSET(ro->ro_rt->rt_flags, RTF_HOST) && 469 !(ro->ro_rt->rt_locks & RTV_MTU)) { 470 u_int rtmtu; 471 472 rtmtu = atomic_load_int(&ro->ro_rt->rt_mtu); 473 if (rtmtu > ifp->if_mtu) { 474 atomic_cas_uint(&ro->ro_rt->rt_mtu, rtmtu, 475 ifp->if_mtu); 476 } 477 } 478 ipstat_inc(ips_cantfrag); 479 goto bad; 480 } 481 482 if ((error = ip_fragment(m, &ml, ifp, mtu)) || 483 (error = if_output_ml(ifp, &ml, sintosa(dst), ro->ro_rt))) 484 goto done; 485 ipstat_inc(ips_fragmented); 486 487done: 488 if (ro == &iproute) 489 rtfree(ro->ro_rt); 490 if_put(ifp); 491#ifdef IPSEC 492 tdb_unref(tdb); 493#endif /* IPSEC */ 494 return (error); 495 496bad: 497 m_freem(m); 498 goto done; 499} 500 501#ifdef IPSEC 502int 503ip_output_ipsec_lookup(struct mbuf *m, int hlen, 504 const struct ipsec_level *seclevel, struct tdb **tdbout, int ipsecflowinfo) 505{ 506 struct m_tag *mtag; 507 struct tdb_ident *tdbi; 508 struct tdb *tdb; 509 struct ipsec_ids *ids = NULL; 510 int error; 511 512 /* Do we have any pending SAs to apply ? */ 513 if (ipsecflowinfo) 514 ids = ipsp_ids_lookup(ipsecflowinfo); 515 error = ipsp_spd_lookup(m, AF_INET, hlen, IPSP_DIRECTION_OUT, 516 NULL, seclevel, &tdb, ids); 517 ipsp_ids_free(ids); 518 if (error || tdb == NULL) { 519 *tdbout = NULL; 520 return error; 521 } 522 /* Loop detection */ 523 for (mtag = m_tag_first(m); mtag != NULL; mtag = m_tag_next(m, mtag)) { 524 if (mtag->m_tag_id != PACKET_TAG_IPSEC_OUT_DONE) 525 continue; 526 tdbi = (struct tdb_ident *)(mtag + 1); 527 if (tdbi->spi == tdb->tdb_spi && 528 tdbi->proto == tdb->tdb_sproto && 529 tdbi->rdomain == tdb->tdb_rdomain && 530 !memcmp(&tdbi->dst, &tdb->tdb_dst, 531 sizeof(union sockaddr_union))) { 532 /* no IPsec needed */ 533 tdb_unref(tdb); 534 *tdbout = NULL; 535 return 0; 536 } 537 } 538 *tdbout = tdb; 539 return 0; 540} 541 542void 543ip_output_ipsec_pmtu_update(struct tdb *tdb, struct route *ro, 544 struct in_addr dst, int rtableid) 545{ 546 struct rtentry *rt = NULL; 547 int rt_mtucloned = 0; 548 int transportmode = (tdb->tdb_dst.sa.sa_family == AF_INET) && 549 (tdb->tdb_dst.sin.sin_addr.s_addr == dst.s_addr); 550 551 /* Find a host route to store the mtu in */ 552 if (ro != NULL) 553 rt = ro->ro_rt; 554 /* but don't add a PMTU route for transport mode SAs */ 555 if (transportmode) 556 rt = NULL; 557 else if (rt == NULL || (rt->rt_flags & RTF_HOST) == 0) { 558 rt = icmp_mtudisc_clone(dst, rtableid, 1); 559 rt_mtucloned = 1; 560 } 561 DPRINTF("spi %08x mtu %d rt %p cloned %d", 562 ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); 563 if (rt != NULL) { 564 atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); 565 if (ro != NULL && ro->ro_rt != NULL) { 566 rtfree(ro->ro_rt); 567 ro->ro_tableid = rtableid; 568 ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, 569 rtableid); 570 } 571 if (rt_mtucloned) 572 rtfree(rt); 573 } 574} 575 576int 577ip_output_ipsec_send(struct tdb *tdb, struct mbuf *m, struct route *ro, 578 u_int rtableid, int fwd) 579{ 580 struct mbuf_list ml; 581 struct ifnet *encif = NULL; 582 struct ip *ip; 583 struct in_addr dst; 584 u_int len; 585 int tso = 0, ip_mtudisc_local = atomic_load_int(&ip_mtudisc); 586 int error = 0; 587 588#if NPF > 0 589 /* 590 * Packet filter 591 */ 592 if ((encif = enc_getif(tdb->tdb_rdomain, tdb->tdb_tap)) == NULL || 593 pf_test(AF_INET, fwd ? PF_FWD : PF_OUT, encif, &m) != PF_PASS) { 594 m_freem(m); 595 return EACCES; 596 } 597 if (m == NULL) 598 return 0; 599 /* 600 * PF_TAG_REROUTE handling or not... 601 * Packet is entering IPsec so the routing is 602 * already overruled by the IPsec policy. 603 * Until now the change was not reconsidered. 604 * What's the behaviour? 605 */ 606#endif 607 608 /* Check if we can chop the TCP packet */ 609 ip = mtod(m, struct ip *); 610 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 611 m->m_pkthdr.ph_mss <= tdb->tdb_mtu) { 612 tso = 1; 613 len = m->m_pkthdr.ph_mss; 614 } else 615 len = ntohs(ip->ip_len); 616 617 /* Check if we are allowed to fragment */ 618 dst = ip->ip_dst; 619 if (ip_mtudisc_local && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && 620 len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { 621 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid); 622 ipsec_adjust_mtu(m, tdb->tdb_mtu); 623 m_freem(m); 624 return EMSGSIZE; 625 } 626 /* propagate IP_DF for v4-over-v6 */ 627 if (ip_mtudisc_local && ip->ip_off & htons(IP_DF)) 628 SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); 629 630 /* 631 * Clear these -- they'll be set in the recursive invocation 632 * as needed. 633 */ 634 m->m_flags &= ~(M_MCAST | M_BCAST); 635 636 if (tso) { 637 error = tcp_softtso_chop(&ml, m, encif, len); 638 if (error) 639 goto done; 640 } else { 641 CLR(m->m_pkthdr.csum_flags, M_TCP_TSO); 642 in_proto_cksum_out(m, encif); 643 ml_init(&ml); 644 ml_enqueue(&ml, m); 645 } 646 647 KERNEL_LOCK(); 648 while ((m = ml_dequeue(&ml)) != NULL) { 649 /* Callee frees mbuf */ 650 error = ipsp_process_packet(m, tdb, AF_INET, 0, 651 IPSP_DF_INHERIT); 652 if (error) 653 break; 654 } 655 KERNEL_UNLOCK(); 656 done: 657 if (error) { 658 ml_purge(&ml); 659 ipsecstat_inc(ipsec_odrops); 660 tdbstat_inc(tdb, tdb_odrops); 661 } 662 if (!error && tso) 663 tcpstat_inc(tcps_outswtso); 664 if (ip_mtudisc_local && error == EMSGSIZE) 665 ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid); 666 return error; 667} 668#endif /* IPSEC */ 669 670int 671ip_fragment(struct mbuf *m0, struct mbuf_list *ml, struct ifnet *ifp, 672 u_long mtu) 673{ 674 struct ip *ip; 675 int firstlen, hlen, tlen, len, off; 676 int error; 677 678 ml_init(ml); 679 ml_enqueue(ml, m0); 680 681 ip = mtod(m0, struct ip *); 682 hlen = ip->ip_hl << 2; 683 tlen = m0->m_pkthdr.len; 684 len = (mtu - hlen) &~ 7; 685 if (len < 8) { 686 error = EMSGSIZE; 687 goto bad; 688 } 689 firstlen = len; 690 691 /* 692 * If we are doing fragmentation, we can't defer TCP/UDP 693 * checksumming; compute the checksum and clear the flag. 694 */ 695 in_proto_cksum_out(m0, NULL); 696 697 /* 698 * Loop through length of payload after first fragment, 699 * make new header and copy data of each part and link onto chain. 700 */ 701 for (off = hlen + firstlen; off < tlen; off += len) { 702 struct mbuf *m; 703 struct ip *mhip; 704 int mhlen; 705 706 MGETHDR(m, M_DONTWAIT, MT_HEADER); 707 if (m == NULL) { 708 error = ENOBUFS; 709 goto bad; 710 } 711 ml_enqueue(ml, m); 712 if ((error = m_dup_pkthdr(m, m0, M_DONTWAIT)) != 0) 713 goto bad; 714 m->m_data += max_linkhdr; 715 mhip = mtod(m, struct ip *); 716 *mhip = *ip; 717 if (hlen > sizeof(struct ip)) { 718 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip); 719 mhip->ip_hl = mhlen >> 2; 720 } else 721 mhlen = sizeof(struct ip); 722 m->m_len = mhlen; 723 724 mhip->ip_off = ((off - hlen) >> 3) + 725 (ntohs(ip->ip_off) & ~IP_MF); 726 if (ip->ip_off & htons(IP_MF)) 727 mhip->ip_off |= IP_MF; 728 if (off + len >= tlen) 729 len = tlen - off; 730 else 731 mhip->ip_off |= IP_MF; 732 mhip->ip_off = htons(mhip->ip_off); 733 734 m->m_pkthdr.len = mhlen + len; 735 mhip->ip_len = htons(m->m_pkthdr.len); 736 m->m_next = m_copym(m0, off, len, M_NOWAIT); 737 if (m->m_next == NULL) { 738 error = ENOBUFS; 739 goto bad; 740 } 741 742 in_hdr_cksum_out(m, ifp); 743 } 744 745 /* 746 * Update first fragment by trimming what's been copied out 747 * and updating header, then send each fragment (in order). 748 */ 749 if (hlen + firstlen < tlen) { 750 m_adj(m0, hlen + firstlen - tlen); 751 ip->ip_off |= htons(IP_MF); 752 } 753 ip->ip_len = htons(m0->m_pkthdr.len); 754 755 in_hdr_cksum_out(m0, ifp); 756 757 ipstat_add(ips_ofragments, ml_len(ml)); 758 return (0); 759 760bad: 761 ipstat_inc(ips_odropped); 762 ml_purge(ml); 763 return (error); 764} 765 766/* 767 * Insert IP options into preformed packet. 768 * Adjust IP destination as required for IP source routing, 769 * as indicated by a non-zero in_addr at the start of the options. 770 */ 771struct mbuf * 772ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen) 773{ 774 struct ipoption *p = mtod(opt, struct ipoption *); 775 struct mbuf *n; 776 struct ip *ip = mtod(m, struct ip *); 777 unsigned int optlen; 778 779 optlen = opt->m_len - sizeof(p->ipopt_dst); 780 if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET) 781 return (m); /* XXX should fail */ 782 783 /* check if options will fit to IP header */ 784 if ((optlen + sizeof(struct ip)) > (0x0f << 2)) { 785 *phlen = sizeof(struct ip); 786 return (m); 787 } 788 789 if (p->ipopt_dst.s_addr) 790 ip->ip_dst = p->ipopt_dst; 791 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) { 792 MGETHDR(n, M_DONTWAIT, MT_HEADER); 793 if (n == NULL) 794 return (m); 795 M_MOVE_HDR(n, m); 796 n->m_pkthdr.len += optlen; 797 m->m_len -= sizeof(struct ip); 798 m->m_data += sizeof(struct ip); 799 n->m_next = m; 800 m = n; 801 m->m_len = optlen + sizeof(struct ip); 802 m->m_data += max_linkhdr; 803 memcpy(mtod(m, caddr_t), ip, sizeof(struct ip)); 804 } else { 805 m->m_data -= optlen; 806 m->m_len += optlen; 807 m->m_pkthdr.len += optlen; 808 memmove(mtod(m, caddr_t), (caddr_t)ip, sizeof(struct ip)); 809 } 810 ip = mtod(m, struct ip *); 811 memcpy(ip + 1, p->ipopt_list, optlen); 812 *phlen = sizeof(struct ip) + optlen; 813 ip->ip_len = htons(ntohs(ip->ip_len) + optlen); 814 return (m); 815} 816 817/* 818 * Copy options from ip to jp, 819 * omitting those not copied during fragmentation. 820 */ 821int 822ip_optcopy(struct ip *ip, struct ip *jp) 823{ 824 u_char *cp, *dp; 825 int opt, optlen, cnt; 826 827 cp = (u_char *)(ip + 1); 828 dp = (u_char *)(jp + 1); 829 cnt = (ip->ip_hl << 2) - sizeof (struct ip); 830 for (; cnt > 0; cnt -= optlen, cp += optlen) { 831 opt = cp[0]; 832 if (opt == IPOPT_EOL) 833 break; 834 if (opt == IPOPT_NOP) { 835 /* Preserve for IP mcast tunnel's LSRR alignment. */ 836 *dp++ = IPOPT_NOP; 837 optlen = 1; 838 continue; 839 } 840#ifdef DIAGNOSTIC 841 if (cnt < IPOPT_OLEN + sizeof(*cp)) 842 panic("malformed IPv4 option passed to ip_optcopy"); 843#endif 844 optlen = cp[IPOPT_OLEN]; 845#ifdef DIAGNOSTIC 846 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 847 panic("malformed IPv4 option passed to ip_optcopy"); 848#endif 849 /* bogus lengths should have been caught by ip_dooptions */ 850 if (optlen > cnt) 851 optlen = cnt; 852 if (IPOPT_COPIED(opt)) { 853 memcpy(dp, cp, optlen); 854 dp += optlen; 855 } 856 } 857 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++) 858 *dp++ = IPOPT_EOL; 859 return (optlen); 860} 861 862/* 863 * IP socket option processing. 864 */ 865int 866ip_ctloutput(int op, struct socket *so, int level, int optname, 867 struct mbuf *m) 868{ 869 struct inpcb *inp = sotoinpcb(so); 870 int optval = 0; 871 struct proc *p = curproc; /* XXX */ 872 int error = 0; 873 u_int rtableid, rtid = 0; 874 875 if (level != IPPROTO_IP) 876 return (EINVAL); 877 878 rtableid = p->p_p->ps_rtableid; 879 880 switch (op) { 881 case PRCO_SETOPT: 882 switch (optname) { 883 case IP_OPTIONS: 884 return (ip_pcbopts(&inp->inp_options, m)); 885 886 case IP_TOS: 887 case IP_TTL: 888 case IP_MINTTL: 889 case IP_RECVOPTS: 890 case IP_RECVRETOPTS: 891 case IP_RECVDSTADDR: 892 case IP_RECVIF: 893 case IP_RECVTTL: 894 case IP_RECVDSTPORT: 895 case IP_RECVRTABLE: 896 case IP_IPSECFLOWINFO: 897 if (m == NULL || m->m_len != sizeof(int)) 898 error = EINVAL; 899 else { 900 optval = *mtod(m, int *); 901 switch (optname) { 902 903 case IP_TOS: 904 inp->inp_ip.ip_tos = optval; 905 break; 906 907 case IP_TTL: 908 if (optval > 0 && optval <= MAXTTL) 909 inp->inp_ip.ip_ttl = optval; 910 else if (optval == -1) 911 inp->inp_ip.ip_ttl = 912 atomic_load_int(&ip_defttl); 913 else 914 error = EINVAL; 915 break; 916 917 case IP_MINTTL: 918 if (optval >= 0 && optval <= MAXTTL) 919 inp->inp_ip_minttl = optval; 920 else 921 error = EINVAL; 922 break; 923#define OPTSET(bit) \ 924 if (optval) \ 925 inp->inp_flags |= bit; \ 926 else \ 927 inp->inp_flags &= ~bit; 928 929 case IP_RECVOPTS: 930 OPTSET(INP_RECVOPTS); 931 break; 932 933 case IP_RECVRETOPTS: 934 OPTSET(INP_RECVRETOPTS); 935 break; 936 937 case IP_RECVDSTADDR: 938 OPTSET(INP_RECVDSTADDR); 939 break; 940 case IP_RECVIF: 941 OPTSET(INP_RECVIF); 942 break; 943 case IP_RECVTTL: 944 OPTSET(INP_RECVTTL); 945 break; 946 case IP_RECVDSTPORT: 947 OPTSET(INP_RECVDSTPORT); 948 break; 949 case IP_RECVRTABLE: 950 OPTSET(INP_RECVRTABLE); 951 break; 952 case IP_IPSECFLOWINFO: 953 OPTSET(INP_IPSECFLOWINFO); 954 break; 955 } 956 } 957 break; 958#undef OPTSET 959 960 case IP_MULTICAST_IF: 961 case IP_MULTICAST_TTL: 962 case IP_MULTICAST_LOOP: 963 case IP_ADD_MEMBERSHIP: 964 case IP_DROP_MEMBERSHIP: 965 error = ip_setmoptions(optname, &inp->inp_moptions, m, 966 inp->inp_rtableid); 967 break; 968 969 case IP_PORTRANGE: 970 if (m == NULL || m->m_len != sizeof(int)) 971 error = EINVAL; 972 else { 973 optval = *mtod(m, int *); 974 975 switch (optval) { 976 977 case IP_PORTRANGE_DEFAULT: 978 inp->inp_flags &= ~(INP_LOWPORT); 979 inp->inp_flags &= ~(INP_HIGHPORT); 980 break; 981 982 case IP_PORTRANGE_HIGH: 983 inp->inp_flags &= ~(INP_LOWPORT); 984 inp->inp_flags |= INP_HIGHPORT; 985 break; 986 987 case IP_PORTRANGE_LOW: 988 inp->inp_flags &= ~(INP_HIGHPORT); 989 inp->inp_flags |= INP_LOWPORT; 990 break; 991 992 default: 993 994 error = EINVAL; 995 break; 996 } 997 } 998 break; 999 case IP_AUTH_LEVEL: 1000 case IP_ESP_TRANS_LEVEL: 1001 case IP_ESP_NETWORK_LEVEL: 1002 case IP_IPCOMP_LEVEL: 1003#ifndef IPSEC 1004 error = EOPNOTSUPP; 1005#else 1006 if (m == NULL || m->m_len != sizeof(int)) { 1007 error = EINVAL; 1008 break; 1009 } 1010 optval = *mtod(m, int *); 1011 1012 if (optval < IPSEC_LEVEL_BYPASS || 1013 optval > IPSEC_LEVEL_UNIQUE) { 1014 error = EINVAL; 1015 break; 1016 } 1017 1018 switch (optname) { 1019 case IP_AUTH_LEVEL: 1020 if (optval < IPSEC_AUTH_LEVEL_DEFAULT && 1021 suser(p)) { 1022 error = EACCES; 1023 break; 1024 } 1025 inp->inp_seclevel.sl_auth = optval; 1026 break; 1027 1028 case IP_ESP_TRANS_LEVEL: 1029 if (optval < IPSEC_ESP_TRANS_LEVEL_DEFAULT && 1030 suser(p)) { 1031 error = EACCES; 1032 break; 1033 } 1034 inp->inp_seclevel.sl_esp_trans = optval; 1035 break; 1036 1037 case IP_ESP_NETWORK_LEVEL: 1038 if (optval < IPSEC_ESP_NETWORK_LEVEL_DEFAULT && 1039 suser(p)) { 1040 error = EACCES; 1041 break; 1042 } 1043 inp->inp_seclevel.sl_esp_network = optval; 1044 break; 1045 case IP_IPCOMP_LEVEL: 1046 if (optval < IPSEC_IPCOMP_LEVEL_DEFAULT && 1047 suser(p)) { 1048 error = EACCES; 1049 break; 1050 } 1051 inp->inp_seclevel.sl_ipcomp = optval; 1052 break; 1053 } 1054#endif 1055 break; 1056 1057 case IP_IPSEC_LOCAL_ID: 1058 case IP_IPSEC_REMOTE_ID: 1059 error = EOPNOTSUPP; 1060 break; 1061 case SO_RTABLE: 1062 if (m == NULL || m->m_len < sizeof(u_int)) { 1063 error = EINVAL; 1064 break; 1065 } 1066 rtid = *mtod(m, u_int *); 1067 if (inp->inp_rtableid == rtid) 1068 break; 1069 /* needs privileges to switch when already set */ 1070 if (rtableid != rtid && rtableid != 0 && 1071 (error = suser(p)) != 0) 1072 break; 1073 error = in_pcbset_rtableid(inp, rtid); 1074 break; 1075 case IP_PIPEX: 1076 if (m != NULL && m->m_len == sizeof(int)) 1077 inp->inp_pipex = *mtod(m, int *); 1078 else 1079 error = EINVAL; 1080 break; 1081 1082 default: 1083 error = ENOPROTOOPT; 1084 break; 1085 } 1086 break; 1087 1088 case PRCO_GETOPT: 1089 switch (optname) { 1090 case IP_OPTIONS: 1091 case IP_RETOPTS: 1092 if (inp->inp_options) { 1093 m->m_len = inp->inp_options->m_len; 1094 memcpy(mtod(m, caddr_t), 1095 mtod(inp->inp_options, caddr_t), m->m_len); 1096 } else 1097 m->m_len = 0; 1098 break; 1099 1100 case IP_TOS: 1101 case IP_TTL: 1102 case IP_MINTTL: 1103 case IP_RECVOPTS: 1104 case IP_RECVRETOPTS: 1105 case IP_RECVDSTADDR: 1106 case IP_RECVIF: 1107 case IP_RECVTTL: 1108 case IP_RECVDSTPORT: 1109 case IP_RECVRTABLE: 1110 case IP_IPSECFLOWINFO: 1111 case IP_IPDEFTTL: 1112 m->m_len = sizeof(int); 1113 switch (optname) { 1114 1115 case IP_TOS: 1116 optval = inp->inp_ip.ip_tos; 1117 break; 1118 1119 case IP_TTL: 1120 optval = inp->inp_ip.ip_ttl; 1121 break; 1122 1123 case IP_MINTTL: 1124 optval = inp->inp_ip_minttl; 1125 break; 1126 1127 case IP_IPDEFTTL: 1128 optval = atomic_load_int(&ip_defttl); 1129 break; 1130 1131#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0) 1132 1133 case IP_RECVOPTS: 1134 optval = OPTBIT(INP_RECVOPTS); 1135 break; 1136 1137 case IP_RECVRETOPTS: 1138 optval = OPTBIT(INP_RECVRETOPTS); 1139 break; 1140 1141 case IP_RECVDSTADDR: 1142 optval = OPTBIT(INP_RECVDSTADDR); 1143 break; 1144 case IP_RECVIF: 1145 optval = OPTBIT(INP_RECVIF); 1146 break; 1147 case IP_RECVTTL: 1148 optval = OPTBIT(INP_RECVTTL); 1149 break; 1150 case IP_RECVDSTPORT: 1151 optval = OPTBIT(INP_RECVDSTPORT); 1152 break; 1153 case IP_RECVRTABLE: 1154 optval = OPTBIT(INP_RECVRTABLE); 1155 break; 1156 case IP_IPSECFLOWINFO: 1157 optval = OPTBIT(INP_IPSECFLOWINFO); 1158 break; 1159 } 1160 *mtod(m, int *) = optval; 1161 break; 1162 1163 case IP_MULTICAST_IF: 1164 case IP_MULTICAST_TTL: 1165 case IP_MULTICAST_LOOP: 1166 case IP_ADD_MEMBERSHIP: 1167 case IP_DROP_MEMBERSHIP: 1168 error = ip_getmoptions(optname, inp->inp_moptions, m); 1169 break; 1170 1171 case IP_PORTRANGE: 1172 m->m_len = sizeof(int); 1173 1174 if (inp->inp_flags & INP_HIGHPORT) 1175 optval = IP_PORTRANGE_HIGH; 1176 else if (inp->inp_flags & INP_LOWPORT) 1177 optval = IP_PORTRANGE_LOW; 1178 else 1179 optval = 0; 1180 1181 *mtod(m, int *) = optval; 1182 break; 1183 1184 case IP_AUTH_LEVEL: 1185 case IP_ESP_TRANS_LEVEL: 1186 case IP_ESP_NETWORK_LEVEL: 1187 case IP_IPCOMP_LEVEL: 1188#ifndef IPSEC 1189 m->m_len = sizeof(int); 1190 *mtod(m, int *) = IPSEC_LEVEL_NONE; 1191#else 1192 m->m_len = sizeof(int); 1193 switch (optname) { 1194 case IP_AUTH_LEVEL: 1195 optval = inp->inp_seclevel.sl_auth; 1196 break; 1197 1198 case IP_ESP_TRANS_LEVEL: 1199 optval = inp->inp_seclevel.sl_esp_trans; 1200 break; 1201 1202 case IP_ESP_NETWORK_LEVEL: 1203 optval = inp->inp_seclevel.sl_esp_network; 1204 break; 1205 case IP_IPCOMP_LEVEL: 1206 optval = inp->inp_seclevel.sl_ipcomp; 1207 break; 1208 } 1209 *mtod(m, int *) = optval; 1210#endif 1211 break; 1212 case IP_IPSEC_LOCAL_ID: 1213 case IP_IPSEC_REMOTE_ID: 1214 error = EOPNOTSUPP; 1215 break; 1216 case SO_RTABLE: 1217 m->m_len = sizeof(u_int); 1218 *mtod(m, u_int *) = inp->inp_rtableid; 1219 break; 1220 case IP_PIPEX: 1221 m->m_len = sizeof(int); 1222 *mtod(m, int *) = inp->inp_pipex; 1223 break; 1224 default: 1225 error = ENOPROTOOPT; 1226 break; 1227 } 1228 break; 1229 } 1230 return (error); 1231} 1232 1233/* 1234 * Set up IP options in pcb for insertion in output packets. 1235 * Store in mbuf with pointer in pcbopt, adding pseudo-option 1236 * with destination address if source routed. 1237 */ 1238int 1239ip_pcbopts(struct mbuf **pcbopt, struct mbuf *m) 1240{ 1241 struct mbuf *n; 1242 struct ipoption *p; 1243 int cnt, off, optlen; 1244 u_char *cp; 1245 u_char opt; 1246 1247 /* turn off any old options */ 1248 m_freem(*pcbopt); 1249 *pcbopt = NULL; 1250 if (m == NULL || m->m_len == 0) { 1251 /* 1252 * Only turning off any previous options. 1253 */ 1254 return (0); 1255 } 1256 1257 if (m->m_len % sizeof(int32_t) || 1258 m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) 1259 return (EINVAL); 1260 1261 /* Don't sleep because NET_LOCK() is hold. */ 1262 if ((n = m_get(M_NOWAIT, MT_SOOPTS)) == NULL) 1263 return (ENOBUFS); 1264 p = mtod(n, struct ipoption *); 1265 memset(p, 0, sizeof (*p)); /* 0 = IPOPT_EOL, needed for padding */ 1266 n->m_len = sizeof(struct in_addr); 1267 1268 off = 0; 1269 cnt = m->m_len; 1270 cp = mtod(m, u_char *); 1271 1272 while (cnt > 0) { 1273 opt = cp[IPOPT_OPTVAL]; 1274 1275 if (opt == IPOPT_NOP || opt == IPOPT_EOL) { 1276 optlen = 1; 1277 } else { 1278 if (cnt < IPOPT_OLEN + sizeof(*cp)) 1279 goto bad; 1280 optlen = cp[IPOPT_OLEN]; 1281 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) 1282 goto bad; 1283 } 1284 switch (opt) { 1285 default: 1286 memcpy(p->ipopt_list + off, cp, optlen); 1287 break; 1288 1289 case IPOPT_LSRR: 1290 case IPOPT_SSRR: 1291 /* 1292 * user process specifies route as: 1293 * ->A->B->C->D 1294 * D must be our final destination (but we can't 1295 * check that since we may not have connected yet). 1296 * A is first hop destination, which doesn't appear in 1297 * actual IP option, but is stored before the options. 1298 */ 1299 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) 1300 goto bad; 1301 1302 /* 1303 * Optlen is smaller because first address is popped. 1304 * Cnt and cp will be adjusted a bit later to reflect 1305 * this. 1306 */ 1307 optlen -= sizeof(struct in_addr); 1308 p->ipopt_list[off + IPOPT_OPTVAL] = opt; 1309 p->ipopt_list[off + IPOPT_OLEN] = optlen; 1310 1311 /* 1312 * Move first hop before start of options. 1313 */ 1314 memcpy(&p->ipopt_dst, cp + IPOPT_OFFSET, 1315 sizeof(struct in_addr)); 1316 cp += sizeof(struct in_addr); 1317 cnt -= sizeof(struct in_addr); 1318 /* 1319 * Then copy rest of options 1320 */ 1321 memcpy(p->ipopt_list + off + IPOPT_OFFSET, 1322 cp + IPOPT_OFFSET, optlen - IPOPT_OFFSET); 1323 break; 1324 } 1325 off += optlen; 1326 cp += optlen; 1327 cnt -= optlen; 1328 1329 if (opt == IPOPT_EOL) 1330 break; 1331 } 1332 /* pad options to next word, since p was zeroed just adjust off */ 1333 off = (off + sizeof(int32_t) - 1) & ~(sizeof(int32_t) - 1); 1334 n->m_len += off; 1335 if (n->m_len > sizeof(*p)) { 1336 bad: 1337 m_freem(n); 1338 return (EINVAL); 1339 } 1340 1341 *pcbopt = n; 1342 return (0); 1343} 1344 1345/* 1346 * Lookup the interface based on the information in the ip_mreqn struct. 1347 */ 1348int 1349ip_multicast_if(struct ip_mreqn *mreq, u_int rtableid, unsigned int *ifidx) 1350{ 1351 struct sockaddr_in sin; 1352 struct rtentry *rt; 1353 1354 /* 1355 * In case userland provides the imr_ifindex use this as interface. 1356 * If no interface address was provided, use the interface of 1357 * the route to the given multicast address. 1358 */ 1359 if (mreq->imr_ifindex != 0) { 1360 *ifidx = mreq->imr_ifindex; 1361 } else if (mreq->imr_address.s_addr == INADDR_ANY) { 1362 memset(&sin, 0, sizeof(sin)); 1363 sin.sin_len = sizeof(sin); 1364 sin.sin_family = AF_INET; 1365 sin.sin_addr = mreq->imr_multiaddr; 1366 rt = rtalloc(sintosa(&sin), RT_RESOLVE, rtableid); 1367 if (!rtisvalid(rt)) { 1368 rtfree(rt); 1369 return EADDRNOTAVAIL; 1370 } 1371 *ifidx = rt->rt_ifidx; 1372 rtfree(rt); 1373 } else { 1374 memset(&sin, 0, sizeof(sin)); 1375 sin.sin_len = sizeof(sin); 1376 sin.sin_family = AF_INET; 1377 sin.sin_addr = mreq->imr_address; 1378 rt = rtalloc(sintosa(&sin), 0, rtableid); 1379 if (!rtisvalid(rt) || !ISSET(rt->rt_flags, RTF_LOCAL)) { 1380 rtfree(rt); 1381 return EADDRNOTAVAIL; 1382 } 1383 *ifidx = rt->rt_ifidx; 1384 rtfree(rt); 1385 } 1386 1387 return 0; 1388} 1389 1390/* 1391 * Set the IP multicast options in response to user setsockopt(). 1392 */ 1393int 1394ip_setmoptions(int optname, struct ip_moptions **imop, struct mbuf *m, 1395 u_int rtableid) 1396{ 1397 struct in_addr addr; 1398 struct in_ifaddr *ia; 1399 struct ip_mreqn mreqn; 1400 struct ifnet *ifp = NULL; 1401 struct ip_moptions *imo = *imop; 1402 struct in_multi **immp; 1403 struct sockaddr_in sin; 1404 unsigned int ifidx; 1405 int i, error = 0; 1406 u_char loop; 1407 1408 if (imo == NULL) { 1409 /* 1410 * No multicast option buffer attached to the pcb; 1411 * allocate one and initialize to default values. 1412 */ 1413 imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK|M_ZERO); 1414 immp = mallocarray(IP_MIN_MEMBERSHIPS, sizeof(*immp), M_IPMOPTS, 1415 M_WAITOK|M_ZERO); 1416 *imop = imo; 1417 imo->imo_ifidx = 0; 1418 imo->imo_ttl = IP_DEFAULT_MULTICAST_TTL; 1419 imo->imo_loop = IP_DEFAULT_MULTICAST_LOOP; 1420 imo->imo_num_memberships = 0; 1421 imo->imo_max_memberships = IP_MIN_MEMBERSHIPS; 1422 imo->imo_membership = immp; 1423 } 1424 1425 switch (optname) { 1426 1427 case IP_MULTICAST_IF: 1428 /* 1429 * Select the interface for outgoing multicast packets. 1430 */ 1431 if (m == NULL) { 1432 error = EINVAL; 1433 break; 1434 } 1435 if (m->m_len == sizeof(struct in_addr)) { 1436 addr = *(mtod(m, struct in_addr *)); 1437 } else if (m->m_len == sizeof(struct ip_mreq) || 1438 m->m_len == sizeof(struct ip_mreqn)) { 1439 memset(&mreqn, 0, sizeof(mreqn)); 1440 memcpy(&mreqn, mtod(m, void *), m->m_len); 1441 1442 /* 1443 * If an interface index is given use this 1444 * index to set the imo_ifidx but check first 1445 * that the interface actually exists. 1446 * In the other case just set the addr to 1447 * the imr_address and fall through to the 1448 * regular code. 1449 */ 1450 if (mreqn.imr_ifindex != 0) { 1451 ifp = if_get(mreqn.imr_ifindex); 1452 if (ifp == NULL || 1453 ifp->if_rdomain != rtable_l2(rtableid)) { 1454 error = EADDRNOTAVAIL; 1455 if_put(ifp); 1456 break; 1457 } 1458 imo->imo_ifidx = ifp->if_index; 1459 if_put(ifp); 1460 break; 1461 } else 1462 addr = mreqn.imr_address; 1463 } else { 1464 error = EINVAL; 1465 break; 1466 } 1467 /* 1468 * INADDR_ANY is used to remove a previous selection. 1469 * When no interface is selected, a default one is 1470 * chosen every time a multicast packet is sent. 1471 */ 1472 if (addr.s_addr == INADDR_ANY) { 1473 imo->imo_ifidx = 0; 1474 break; 1475 } 1476 /* 1477 * The selected interface is identified by its local 1478 * IP address. Find the interface and confirm that 1479 * it supports multicasting. 1480 */ 1481 memset(&sin, 0, sizeof(sin)); 1482 sin.sin_len = sizeof(sin); 1483 sin.sin_family = AF_INET; 1484 sin.sin_addr = addr; 1485 ia = ifatoia(ifa_ifwithaddr(sintosa(&sin), rtableid)); 1486 if (ia == NULL || 1487 (ia->ia_ifp->if_flags & IFF_MULTICAST) == 0) { 1488 error = EADDRNOTAVAIL; 1489 break; 1490 } 1491 imo->imo_ifidx = ia->ia_ifp->if_index; 1492 break; 1493 1494 case IP_MULTICAST_TTL: 1495 /* 1496 * Set the IP time-to-live for outgoing multicast packets. 1497 */ 1498 if (m == NULL || m->m_len != 1) { 1499 error = EINVAL; 1500 break; 1501 } 1502 imo->imo_ttl = *(mtod(m, u_char *)); 1503 break; 1504 1505 case IP_MULTICAST_LOOP: 1506 /* 1507 * Set the loopback flag for outgoing multicast packets. 1508 * Must be zero or one. 1509 */ 1510 if (m == NULL || m->m_len != 1 || 1511 (loop = *(mtod(m, u_char *))) > 1) { 1512 error = EINVAL; 1513 break; 1514 } 1515 imo->imo_loop = loop; 1516 break; 1517 1518 case IP_ADD_MEMBERSHIP: 1519 /* 1520 * Add a multicast group membership. 1521 * Group must be a valid IP multicast address. 1522 */ 1523 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1524 m->m_len == sizeof(struct ip_mreqn))) { 1525 error = EINVAL; 1526 break; 1527 } 1528 memset(&mreqn, 0, sizeof(mreqn)); 1529 memcpy(&mreqn, mtod(m, void *), m->m_len); 1530 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1531 error = EINVAL; 1532 break; 1533 } 1534 1535 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1536 if (error) 1537 break; 1538 1539 /* 1540 * See if we found an interface, and confirm that it 1541 * supports multicast. 1542 */ 1543 ifp = if_get(ifidx); 1544 if (ifp == NULL || ifp->if_rdomain != rtable_l2(rtableid) || 1545 (ifp->if_flags & IFF_MULTICAST) == 0) { 1546 error = EADDRNOTAVAIL; 1547 if_put(ifp); 1548 break; 1549 } 1550 1551 /* 1552 * See if the membership already exists or if all the 1553 * membership slots are full. 1554 */ 1555 for (i = 0; i < imo->imo_num_memberships; ++i) { 1556 if (imo->imo_membership[i]->inm_ifidx == ifidx && 1557 imo->imo_membership[i]->inm_addr.s_addr == 1558 mreqn.imr_multiaddr.s_addr) 1559 break; 1560 } 1561 if (i < imo->imo_num_memberships) { 1562 error = EADDRINUSE; 1563 if_put(ifp); 1564 break; 1565 } 1566 if (imo->imo_num_memberships == imo->imo_max_memberships) { 1567 struct in_multi **nmships, **omships; 1568 size_t newmax; 1569 /* 1570 * Resize the vector to next power-of-two minus 1. If 1571 * the size would exceed the maximum then we know we've 1572 * really run out of entries. Otherwise, we reallocate 1573 * the vector. 1574 */ 1575 nmships = NULL; 1576 omships = imo->imo_membership; 1577 newmax = ((imo->imo_max_memberships + 1) * 2) - 1; 1578 if (newmax <= IP_MAX_MEMBERSHIPS) { 1579 nmships = mallocarray(newmax, sizeof(*nmships), 1580 M_IPMOPTS, M_NOWAIT|M_ZERO); 1581 if (nmships != NULL) { 1582 memcpy(nmships, omships, 1583 sizeof(*omships) * 1584 imo->imo_max_memberships); 1585 free(omships, M_IPMOPTS, 1586 sizeof(*omships) * 1587 imo->imo_max_memberships); 1588 imo->imo_membership = nmships; 1589 imo->imo_max_memberships = newmax; 1590 } 1591 } 1592 if (nmships == NULL) { 1593 error = ENOBUFS; 1594 if_put(ifp); 1595 break; 1596 } 1597 } 1598 /* 1599 * Everything looks good; add a new record to the multicast 1600 * address list for the given interface. 1601 */ 1602 if ((imo->imo_membership[i] = 1603 in_addmulti(&mreqn.imr_multiaddr, ifp)) == NULL) { 1604 error = ENOBUFS; 1605 if_put(ifp); 1606 break; 1607 } 1608 ++imo->imo_num_memberships; 1609 if_put(ifp); 1610 break; 1611 1612 case IP_DROP_MEMBERSHIP: 1613 /* 1614 * Drop a multicast group membership. 1615 * Group must be a valid IP multicast address. 1616 */ 1617 if (m == NULL || !(m->m_len == sizeof(struct ip_mreq) || 1618 m->m_len == sizeof(struct ip_mreqn))) { 1619 error = EINVAL; 1620 break; 1621 } 1622 memset(&mreqn, 0, sizeof(mreqn)); 1623 memcpy(&mreqn, mtod(m, void *), m->m_len); 1624 if (!IN_MULTICAST(mreqn.imr_multiaddr.s_addr)) { 1625 error = EINVAL; 1626 break; 1627 } 1628 1629 /* 1630 * If an interface address was specified, get a pointer 1631 * to its ifnet structure. 1632 */ 1633 error = ip_multicast_if(&mreqn, rtableid, &ifidx); 1634 if (error) 1635 break; 1636 1637 /* 1638 * Find the membership in the membership array. 1639 */ 1640 for (i = 0; i < imo->imo_num_memberships; ++i) { 1641 if ((ifidx == 0 || 1642 imo->imo_membership[i]->inm_ifidx == ifidx) && 1643 imo->imo_membership[i]->inm_addr.s_addr == 1644 mreqn.imr_multiaddr.s_addr) 1645 break; 1646 } 1647 if (i == imo->imo_num_memberships) { 1648 error = EADDRNOTAVAIL; 1649 break; 1650 } 1651 /* 1652 * Give up the multicast address record to which the 1653 * membership points. 1654 */ 1655 in_delmulti(imo->imo_membership[i]); 1656 /* 1657 * Remove the gap in the membership array. 1658 */ 1659 for (++i; i < imo->imo_num_memberships; ++i) 1660 imo->imo_membership[i-1] = imo->imo_membership[i]; 1661 --imo->imo_num_memberships; 1662 break; 1663 1664 default: 1665 error = EOPNOTSUPP; 1666 break; 1667 } 1668 1669 /* 1670 * If all options have default values, no need to keep the data. 1671 */ 1672 if (imo->imo_ifidx == 0 && 1673 imo->imo_ttl == IP_DEFAULT_MULTICAST_TTL && 1674 imo->imo_loop == IP_DEFAULT_MULTICAST_LOOP && 1675 imo->imo_num_memberships == 0) { 1676 free(imo->imo_membership , M_IPMOPTS, 1677 imo->imo_max_memberships * sizeof(struct in_multi *)); 1678 free(*imop, M_IPMOPTS, sizeof(**imop)); 1679 *imop = NULL; 1680 } 1681 1682 return (error); 1683} 1684 1685/* 1686 * Return the IP multicast options in response to user getsockopt(). 1687 */ 1688int 1689ip_getmoptions(int optname, struct ip_moptions *imo, struct mbuf *m) 1690{ 1691 u_char *ttl; 1692 u_char *loop; 1693 struct in_addr *addr; 1694 struct in_ifaddr *ia; 1695 struct ifnet *ifp; 1696 1697 switch (optname) { 1698 1699 case IP_MULTICAST_IF: 1700 addr = mtod(m, struct in_addr *); 1701 m->m_len = sizeof(struct in_addr); 1702 if (imo == NULL || (ifp = if_get(imo->imo_ifidx)) == NULL) 1703 addr->s_addr = INADDR_ANY; 1704 else { 1705 ia = in_ifp2ia(ifp); 1706 addr->s_addr = (ia == NULL) ? INADDR_ANY : 1707 ia->ia_addr.sin_addr.s_addr; 1708 if_put(ifp); 1709 } 1710 return (0); 1711 1712 case IP_MULTICAST_TTL: 1713 ttl = mtod(m, u_char *); 1714 m->m_len = 1; 1715 *ttl = (imo == NULL) ? IP_DEFAULT_MULTICAST_TTL 1716 : imo->imo_ttl; 1717 return (0); 1718 1719 case IP_MULTICAST_LOOP: 1720 loop = mtod(m, u_char *); 1721 m->m_len = 1; 1722 *loop = (imo == NULL) ? IP_DEFAULT_MULTICAST_LOOP 1723 : imo->imo_loop; 1724 return (0); 1725 1726 default: 1727 return (EOPNOTSUPP); 1728 } 1729} 1730 1731/* 1732 * Discard the IP multicast options. 1733 */ 1734void 1735ip_freemoptions(struct ip_moptions *imo) 1736{ 1737 int i; 1738 1739 if (imo != NULL) { 1740 for (i = 0; i < imo->imo_num_memberships; ++i) 1741 in_delmulti(imo->imo_membership[i]); 1742 free(imo->imo_membership, M_IPMOPTS, 1743 imo->imo_max_memberships * sizeof(struct in_multi *)); 1744 free(imo, M_IPMOPTS, sizeof(*imo)); 1745 } 1746} 1747 1748/* 1749 * Routine called from ip_output() to loop back a copy of an IP multicast 1750 * packet to the input queue of a specified interface. 1751 */ 1752void 1753ip_mloopback(struct ifnet *ifp, struct mbuf *m, struct sockaddr_in *dst) 1754{ 1755 struct mbuf *copym; 1756 1757 copym = m_dup_pkt(m, max_linkhdr, M_DONTWAIT); 1758 if (copym != NULL) { 1759 /* 1760 * We don't bother to fragment if the IP length is greater 1761 * than the interface's MTU. Can this possibly matter? 1762 */ 1763 in_hdr_cksum_out(copym, NULL); 1764 if_input_local(ifp, copym, dst->sin_family, NULL); 1765 } 1766} 1767 1768void 1769in_hdr_cksum_out(struct mbuf *m, struct ifnet *ifp) 1770{ 1771 struct ip *ip = mtod(m, struct ip *); 1772 1773 ip->ip_sum = 0; 1774 if (in_ifcap_cksum(m, ifp, IFCAP_CSUM_IPv4)) { 1775 SET(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1776 } else { 1777 ipstat_inc(ips_outswcsum); 1778 ip->ip_sum = in_cksum(m, ip->ip_hl << 2); 1779 CLR(m->m_pkthdr.csum_flags, M_IPV4_CSUM_OUT); 1780 } 1781} 1782 1783/* 1784 * Compute significant parts of the IPv4 checksum pseudo-header 1785 * for use in a delayed TCP/UDP checksum calculation. 1786 */ 1787static u_int16_t 1788in_cksum_phdr(u_int32_t src, u_int32_t dst, u_int32_t lenproto) 1789{ 1790 u_int32_t sum; 1791 1792 sum = lenproto + 1793 (u_int16_t)(src >> 16) + 1794 (u_int16_t)(src /*& 0xffff*/) + 1795 (u_int16_t)(dst >> 16) + 1796 (u_int16_t)(dst /*& 0xffff*/); 1797 1798 sum = (u_int16_t)(sum >> 16) + (u_int16_t)(sum /*& 0xffff*/); 1799 1800 if (sum > 0xffff) 1801 sum -= 0xffff; 1802 1803 return (sum); 1804} 1805 1806/* 1807 * Process a delayed payload checksum calculation. 1808 */ 1809void 1810in_delayed_cksum(struct mbuf *m) 1811{ 1812 struct ip *ip; 1813 u_int16_t csum, offset; 1814 1815 ip = mtod(m, struct ip *); 1816 offset = ip->ip_hl << 2; 1817 csum = in4_cksum(m, 0, offset, m->m_pkthdr.len - offset); 1818 if (csum == 0 && ip->ip_p == IPPROTO_UDP) 1819 csum = 0xffff; 1820 1821 switch (ip->ip_p) { 1822 case IPPROTO_TCP: 1823 offset += offsetof(struct tcphdr, th_sum); 1824 break; 1825 1826 case IPPROTO_UDP: 1827 offset += offsetof(struct udphdr, uh_sum); 1828 break; 1829 1830 case IPPROTO_ICMP: 1831 offset += offsetof(struct icmp, icmp_cksum); 1832 break; 1833 1834 default: 1835 return; 1836 } 1837 1838 if ((offset + sizeof(u_int16_t)) > m->m_len) 1839 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1840 else 1841 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1842} 1843 1844void 1845in_proto_cksum_out(struct mbuf *m, struct ifnet *ifp) 1846{ 1847 struct ip *ip = mtod(m, struct ip *); 1848 1849 /* some hw and in_delayed_cksum need the pseudo header cksum */ 1850 if (m->m_pkthdr.csum_flags & 1851 (M_TCP_CSUM_OUT|M_UDP_CSUM_OUT|M_ICMP_CSUM_OUT)) { 1852 u_int16_t csum = 0, offset; 1853 1854 offset = ip->ip_hl << 2; 1855 if (ISSET(m->m_pkthdr.csum_flags, M_TCP_TSO) && 1856 in_ifcap_cksum(m, ifp, IFCAP_TSOv4)) { 1857 csum = in_cksum_phdr(ip->ip_src.s_addr, 1858 ip->ip_dst.s_addr, htonl(ip->ip_p)); 1859 } else if (ISSET(m->m_pkthdr.csum_flags, 1860 M_TCP_CSUM_OUT|M_UDP_CSUM_OUT)) { 1861 csum = in_cksum_phdr(ip->ip_src.s_addr, 1862 ip->ip_dst.s_addr, htonl(ntohs(ip->ip_len) - 1863 offset + ip->ip_p)); 1864 } 1865 if (ip->ip_p == IPPROTO_TCP) 1866 offset += offsetof(struct tcphdr, th_sum); 1867 else if (ip->ip_p == IPPROTO_UDP) 1868 offset += offsetof(struct udphdr, uh_sum); 1869 else if (ip->ip_p == IPPROTO_ICMP) 1870 offset += offsetof(struct icmp, icmp_cksum); 1871 if ((offset + sizeof(u_int16_t)) > m->m_len) 1872 m_copyback(m, offset, sizeof(csum), &csum, M_NOWAIT); 1873 else 1874 *(u_int16_t *)(mtod(m, caddr_t) + offset) = csum; 1875 } 1876 1877 if (m->m_pkthdr.csum_flags & M_TCP_CSUM_OUT) { 1878 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_TCPv4) || 1879 ip->ip_hl != 5) { 1880 tcpstat_inc(tcps_outswcsum); 1881 in_delayed_cksum(m); 1882 m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_OUT; /* Clear */ 1883 } 1884 } else if (m->m_pkthdr.csum_flags & M_UDP_CSUM_OUT) { 1885 if (!in_ifcap_cksum(m, ifp, IFCAP_CSUM_UDPv4) || 1886 ip->ip_hl != 5) { 1887 udpstat_inc(udps_outswcsum); 1888 in_delayed_cksum(m); 1889 m->m_pkthdr.csum_flags &= ~M_UDP_CSUM_OUT; /* Clear */ 1890 } 1891 } else if (m->m_pkthdr.csum_flags & M_ICMP_CSUM_OUT) { 1892 in_delayed_cksum(m); 1893 m->m_pkthdr.csum_flags &= ~M_ICMP_CSUM_OUT; /* Clear */ 1894 } 1895} 1896 1897int 1898in_ifcap_cksum(struct mbuf *m, struct ifnet *ifp, int ifcap) 1899{ 1900 if ((ifp == NULL) || 1901 !ISSET(ifp->if_capabilities, ifcap) || 1902 (ifp->if_bridgeidx != 0)) 1903 return (0); 1904 /* 1905 * Simplex interface sends packet back without hardware cksum. 1906 * Keep this check in sync with the condition where ether_resolve() 1907 * calls if_input_local(). 1908 */ 1909 if (ISSET(m->m_flags, M_BCAST) && 1910 ISSET(ifp->if_flags, IFF_SIMPLEX) && 1911 !m->m_pkthdr.pf.routed) 1912 return (0); 1913 return (1); 1914}