/*
 * (Web-view page header, not part of the original source file:
 *  "jcs's openbsd hax" — openbsd — at jcs, 1654 lines, 41 kB, view raw)
 */
1/* $OpenBSD: tcp_usrreq.c,v 1.253 2025/10/24 15:09:56 bluhm Exp $ */ 2/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ 3 4/* 5 * Copyright (c) 1982, 1986, 1988, 1993 6 * The Regents of the University of California. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. Neither the name of the University nor the names of its contributors 17 * may be used to endorse or promote products derived from this software 18 * without specific prior written permission. 19 * 20 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30 * SUCH DAMAGE. 
31 * 32 * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995 33 * 34 * NRL grants permission for redistribution and use in source and binary 35 * forms, with or without modification, of the software and documentation 36 * created at NRL provided that the following conditions are met: 37 * 38 * 1. Redistributions of source code must retain the above copyright 39 * notice, this list of conditions and the following disclaimer. 40 * 2. Redistributions in binary form must reproduce the above copyright 41 * notice, this list of conditions and the following disclaimer in the 42 * documentation and/or other materials provided with the distribution. 43 * 3. All advertising materials mentioning features or use of this software 44 * must display the following acknowledgements: 45 * This product includes software developed by the University of 46 * California, Berkeley and its contributors. 47 * This product includes software developed at the Information 48 * Technology Division, US Naval Research Laboratory. 49 * 4. Neither the name of the NRL nor the names of its contributors 50 * may be used to endorse or promote products derived from this software 51 * without specific prior written permission. 52 * 53 * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS 54 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 56 * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR 57 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 58 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 59 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 60 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 61 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 62 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 63 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
64 * 65 * The views and conclusions contained in the software and documentation 66 * are those of the authors and should not be interpreted as representing 67 * official policies, either expressed or implied, of the US Naval 68 * Research Laboratory (NRL). 69 */ 70 71#include <sys/param.h> 72#include <sys/systm.h> 73#include <sys/mbuf.h> 74#include <sys/socket.h> 75#include <sys/protosw.h> 76#include <sys/stat.h> 77#include <sys/sysctl.h> 78#include <sys/domain.h> 79#include <sys/pool.h> 80 81#include <net/if.h> 82#include <net/if_var.h> 83 84#include <netinet/in.h> 85#include <netinet/in_var.h> 86#include <netinet/ip.h> 87#include <netinet/in_pcb.h> 88#include <netinet/ip_var.h> 89#include <netinet6/ip6_var.h> 90#include <netinet/tcp.h> 91#include <netinet/tcp_fsm.h> 92#include <netinet/tcp_seq.h> 93#include <netinet/tcp_timer.h> 94#include <netinet/tcp_var.h> 95#include <netinet/tcp_debug.h> 96 97#ifdef INET6 98#include <netinet6/in6_var.h> 99#endif 100 101/* 102 * Locks used to protect global variables in this file: 103 * I immutable after creation 104 */ 105 106#ifndef TCP_SENDSPACE 107#define TCP_SENDSPACE 1024*16 108#endif 109u_int tcp_sendspace = TCP_SENDSPACE; /* [I] */ 110#ifndef TCP_RECVSPACE 111#define TCP_RECVSPACE 1024*16 112#endif 113u_int tcp_recvspace = TCP_RECVSPACE; /* [I] */ 114u_int tcp_autorcvbuf_inc = 16 * 1024; /* [I] */ 115 116const struct pr_usrreqs tcp_usrreqs = { 117 .pru_attach = tcp_attach, 118 .pru_detach = tcp_detach, 119 .pru_bind = tcp_bind, 120 .pru_listen = tcp_listen, 121 .pru_connect = tcp_connect, 122 .pru_accept = tcp_accept, 123 .pru_disconnect = tcp_disconnect, 124 .pru_shutdown = tcp_shutdown, 125 .pru_rcvd = tcp_rcvd, 126 .pru_send = tcp_send, 127 .pru_abort = tcp_abort, 128 .pru_sense = tcp_sense, 129 .pru_rcvoob = tcp_rcvoob, 130 .pru_sendoob = tcp_sendoob, 131 .pru_control = in_control, 132 .pru_sockaddr = tcp_sockaddr, 133 .pru_peeraddr = tcp_peeraddr, 134 .pru_flowid = in_flowid, 135}; 136 137#ifdef INET6 138const 
struct pr_usrreqs tcp6_usrreqs = { 139 .pru_attach = tcp_attach, 140 .pru_detach = tcp_detach, 141 .pru_bind = tcp_bind, 142 .pru_listen = tcp_listen, 143 .pru_connect = tcp_connect, 144 .pru_accept = tcp_accept, 145 .pru_disconnect = tcp_disconnect, 146 .pru_shutdown = tcp_shutdown, 147 .pru_rcvd = tcp_rcvd, 148 .pru_send = tcp_send, 149 .pru_abort = tcp_abort, 150 .pru_sense = tcp_sense, 151 .pru_rcvoob = tcp_rcvoob, 152 .pru_sendoob = tcp_sendoob, 153 .pru_control = in6_control, 154 .pru_sockaddr = tcp_sockaddr, 155 .pru_peeraddr = tcp_peeraddr, 156 .pru_flowid = in_flowid, 157}; 158#endif 159 160#ifndef SMALL_KERNEL 161const struct sysctl_bounded_args tcpctl_vars[] = { 162 { TCPCTL_KEEPINITTIME, &tcp_keepinit_sec, 1, 163 3 * TCPTV_KEEPINIT / TCP_TIME(1) }, 164 { TCPCTL_KEEPIDLE, &tcp_keepidle_sec, 1, 165 5 * TCPTV_KEEPIDLE / TCP_TIME(1) }, 166 { TCPCTL_KEEPINTVL, &tcp_keepintvl_sec, 1, 167 3 * TCPTV_KEEPINTVL / TCP_TIME(1) }, 168 { TCPCTL_RFC1323, &tcp_do_rfc1323, 0, 1 }, 169 { TCPCTL_SACK, &tcp_do_sack, 0, 1 }, 170 { TCPCTL_MSSDFLT, &tcp_mssdflt, TCP_MSS, 65535 }, 171 { TCPCTL_RSTPPSLIMIT, &tcp_rst_ppslim, 1, 1000 * 1000 }, 172 { TCPCTL_ACK_ON_PUSH, &tcp_ack_on_push, 0, 1 }, 173#ifdef TCP_ECN 174 { TCPCTL_ECN, &tcp_do_ecn, 0, 1 }, 175#endif 176 { TCPCTL_SYN_CACHE_LIMIT, &tcp_syn_cache_limit, 1, 1000 * 1000 }, 177 { TCPCTL_SYN_BUCKET_LIMIT, &tcp_syn_bucket_limit, 1, INT_MAX }, 178 { TCPCTL_RFC3390, &tcp_do_rfc3390, 0, 2 }, 179 { TCPCTL_ALWAYS_KEEPALIVE, &tcp_always_keepalive, 0, 1 }, 180 { TCPCTL_TSO, &tcp_do_tso, 0, 1 }, 181}; 182#endif /* SMALL_KERNEL */ 183 184struct inpcbtable tcbtable; 185#ifdef INET6 186struct inpcbtable tcb6table; 187#endif 188 189int tcp_fill_info(struct tcpcb *, struct socket *, struct mbuf *); 190int tcp_ident(void *, size_t *, void *, size_t, int); 191 192static inline int tcp_sogetpcb(struct socket *, struct inpcb **, 193 struct tcpcb **); 194 195static inline int 196tcp_sogetpcb(struct socket *so, struct inpcb **rinp, struct tcpcb 
**rtp) 197{ 198 struct inpcb *inp; 199 struct tcpcb *tp; 200 201 /* 202 * When a TCP is attached to a socket, then there will be 203 * a (struct inpcb) pointed at by the socket, and this 204 * structure will point at a subsidiary (struct tcpcb). 205 */ 206 if ((inp = sotoinpcb(so)) == NULL || (tp = intotcpcb(inp)) == NULL) { 207 int error; 208 209 if ((error = READ_ONCE(so->so_error))) 210 return error; 211 return EINVAL; 212 } 213 214 *rinp = inp; 215 *rtp = tp; 216 217 return 0; 218} 219 220/* 221 * Export internal TCP state information via a struct tcp_info without 222 * leaking any sensitive information. Sequence numbers are reported 223 * relative to the initial sequence number. 224 */ 225int 226tcp_fill_info(struct tcpcb *tp, struct socket *so, struct mbuf *m) 227{ 228 struct proc *p = curproc; 229 struct tcp_info *ti; 230 u_int t = 1000; /* msec => usec */ 231 uint64_t now; 232 233 if (sizeof(*ti) > MLEN) { 234 MCLGETL(m, M_WAITOK, sizeof(*ti)); 235 if (!ISSET(m->m_flags, M_EXT)) 236 return ENOMEM; 237 } 238 ti = mtod(m, struct tcp_info *); 239 m->m_len = sizeof(*ti); 240 memset(ti, 0, sizeof(*ti)); 241 now = tcp_now(); 242 243 ti->tcpi_state = tp->t_state; 244 if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) 245 ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; 246 if (tp->t_flags & TF_SACK_PERMIT) 247 ti->tcpi_options |= TCPI_OPT_SACK; 248 if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { 249 ti->tcpi_options |= TCPI_OPT_WSCALE; 250 ti->tcpi_snd_wscale = tp->snd_scale; 251 ti->tcpi_rcv_wscale = tp->rcv_scale; 252 } 253#ifdef TCP_ECN 254 if (tp->t_flags & TF_ECN_PERMIT) 255 ti->tcpi_options |= TCPI_OPT_ECN; 256#endif 257 258 ti->tcpi_rto = tp->t_rxtcur * t; 259 ti->tcpi_snd_mss = tp->t_maxseg; 260 ti->tcpi_rcv_mss = tp->t_peermss; 261 262 ti->tcpi_last_data_sent = (now - tp->t_sndtime) * t; 263 ti->tcpi_last_ack_sent = (now - tp->t_sndacktime) * t; 264 ti->tcpi_last_data_recv = (now - tp->t_rcvtime) * t; 265 
ti->tcpi_last_ack_recv = (now - tp->t_rcvacktime) * t; 266 267 ti->tcpi_rtt = ((uint64_t)tp->t_srtt * t) >> 268 (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT); 269 ti->tcpi_rttvar = ((uint64_t)tp->t_rttvar * t) >> 270 (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT); 271 ti->tcpi_snd_ssthresh = tp->snd_ssthresh; 272 ti->tcpi_snd_cwnd = tp->snd_cwnd; 273 274 ti->tcpi_rcv_space = tp->rcv_wnd; 275 276 /* 277 * Provide only minimal information for unprivileged processes. 278 */ 279 if (suser(p) != 0) 280 return 0; 281 282 /* FreeBSD-specific extension fields for tcp_info. */ 283 ti->tcpi_snd_wnd = tp->snd_wnd; 284 ti->tcpi_snd_nxt = tp->snd_nxt - tp->iss; 285 ti->tcpi_rcv_nxt = tp->rcv_nxt - tp->irs; 286 /* missing tcpi_toe_tid */ 287 ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; 288 ti->tcpi_rcv_ooopack = tp->t_rcvoopack; 289 ti->tcpi_snd_zerowin = tp->t_sndzerowin; 290 291 /* OpenBSD extensions */ 292 ti->tcpi_rttmin = tp->t_rttmin * t; 293 ti->tcpi_max_sndwnd = tp->max_sndwnd; 294 ti->tcpi_rcv_adv = tp->rcv_adv - tp->irs; 295 ti->tcpi_rcv_up = tp->rcv_up - tp->irs; 296 ti->tcpi_snd_una = tp->snd_una - tp->iss; 297 ti->tcpi_snd_up = tp->snd_up - tp->iss; 298 ti->tcpi_snd_wl1 = tp->snd_wl1 - tp->iss; 299 ti->tcpi_snd_wl2 = tp->snd_wl2 - tp->iss; 300 ti->tcpi_snd_max = tp->snd_max - tp->iss; 301 302 ti->tcpi_ts_recent = tp->ts_recent; /* XXX value from the wire */ 303 ti->tcpi_ts_recent_age = (now - tp->ts_recent_age) * t; 304 ti->tcpi_rfbuf_cnt = tp->rfbuf_cnt; 305 ti->tcpi_rfbuf_ts = (now - tp->rfbuf_ts) * t; 306 307 mtx_enter(&so->so_rcv.sb_mtx); 308 ti->tcpi_so_rcv_sb_cc = so->so_rcv.sb_cc; 309 ti->tcpi_so_rcv_sb_hiwat = so->so_rcv.sb_hiwat; 310 ti->tcpi_so_rcv_sb_lowat = so->so_rcv.sb_lowat; 311 ti->tcpi_so_rcv_sb_wat = so->so_rcv.sb_wat; 312 mtx_leave(&so->so_rcv.sb_mtx); 313 mtx_enter(&so->so_snd.sb_mtx); 314 ti->tcpi_so_snd_sb_cc = so->so_snd.sb_cc; 315 ti->tcpi_so_snd_sb_hiwat = so->so_snd.sb_hiwat; 316 ti->tcpi_so_snd_sb_lowat = so->so_snd.sb_lowat; 317 ti->tcpi_so_snd_sb_wat 
= so->so_snd.sb_wat; 318 mtx_leave(&so->so_snd.sb_mtx); 319 320 return 0; 321} 322 323int 324tcp_ctloutput(int op, struct socket *so, int level, int optname, 325 struct mbuf *m) 326{ 327 int error = 0; 328 struct inpcb *inp; 329 struct tcpcb *tp; 330 int i; 331 332 inp = sotoinpcb(so); 333 if (inp == NULL) 334 return (ECONNRESET); 335 if (level != IPPROTO_TCP) { 336#ifdef INET6 337 if (ISSET(inp->inp_flags, INP_IPV6)) 338 error = ip6_ctloutput(op, so, level, optname, m); 339 else 340#endif 341 error = ip_ctloutput(op, so, level, optname, m); 342 return (error); 343 } 344 tp = intotcpcb(inp); 345 346 switch (op) { 347 348 case PRCO_SETOPT: 349 switch (optname) { 350 351 case TCP_NODELAY: 352 if (m == NULL || m->m_len < sizeof (int)) 353 error = EINVAL; 354 else if (*mtod(m, int *)) 355 tp->t_flags |= TF_NODELAY; 356 else 357 tp->t_flags &= ~TF_NODELAY; 358 break; 359 360 case TCP_NOPUSH: 361 if (m == NULL || m->m_len < sizeof (int)) 362 error = EINVAL; 363 else if (*mtod(m, int *)) 364 tp->t_flags |= TF_NOPUSH; 365 else if (tp->t_flags & TF_NOPUSH) { 366 tp->t_flags &= ~TF_NOPUSH; 367 if (TCPS_HAVEESTABLISHED(tp->t_state)) 368 error = tcp_output(tp); 369 } 370 break; 371 372 case TCP_MAXSEG: 373 if (m == NULL || m->m_len < sizeof (int)) { 374 error = EINVAL; 375 break; 376 } 377 378 i = *mtod(m, int *); 379 if (i > 0 && i <= tp->t_maxseg) 380 tp->t_maxseg = i; 381 else 382 error = EINVAL; 383 break; 384 385 case TCP_SACK_ENABLE: 386 if (m == NULL || m->m_len < sizeof (int)) { 387 error = EINVAL; 388 break; 389 } 390 391 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 392 error = EPERM; 393 break; 394 } 395 396 if (tp->t_flags & TF_SIGNATURE) { 397 error = EPERM; 398 break; 399 } 400 401 if (*mtod(m, int *)) 402 tp->sack_enable = 1; 403 else 404 tp->sack_enable = 0; 405 break; 406#ifdef TCP_SIGNATURE 407 case TCP_MD5SIG: 408 if (m == NULL || m->m_len < sizeof (int)) { 409 error = EINVAL; 410 break; 411 } 412 413 if (TCPS_HAVEESTABLISHED(tp->t_state)) { 414 error = EPERM; 
415 break; 416 } 417 418 if (*mtod(m, int *)) { 419 tp->t_flags |= TF_SIGNATURE; 420 tp->sack_enable = 0; 421 } else 422 tp->t_flags &= ~TF_SIGNATURE; 423 break; 424#endif /* TCP_SIGNATURE */ 425 default: 426 error = ENOPROTOOPT; 427 break; 428 } 429 break; 430 431 case PRCO_GETOPT: 432 switch (optname) { 433 case TCP_NODELAY: 434 m->m_len = sizeof(int); 435 *mtod(m, int *) = tp->t_flags & TF_NODELAY; 436 break; 437 case TCP_NOPUSH: 438 m->m_len = sizeof(int); 439 *mtod(m, int *) = tp->t_flags & TF_NOPUSH; 440 break; 441 case TCP_MAXSEG: 442 m->m_len = sizeof(int); 443 *mtod(m, int *) = tp->t_maxseg; 444 break; 445 case TCP_SACK_ENABLE: 446 m->m_len = sizeof(int); 447 *mtod(m, int *) = tp->sack_enable; 448 break; 449 case TCP_INFO: 450 error = tcp_fill_info(tp, so, m); 451 break; 452#ifdef TCP_SIGNATURE 453 case TCP_MD5SIG: 454 m->m_len = sizeof(int); 455 *mtod(m, int *) = tp->t_flags & TF_SIGNATURE; 456 break; 457#endif 458 default: 459 error = ENOPROTOOPT; 460 break; 461 } 462 break; 463 } 464 return (error); 465} 466 467/* 468 * Attach TCP protocol to socket, allocating 469 * internet protocol control block, tcp control block, 470 * buffer space, and entering LISTEN state to accept connections. 
471 */ 472int 473tcp_attach(struct socket *so, int proto, int wait) 474{ 475 struct inpcbtable *table; 476 struct tcpcb *tp; 477 struct inpcb *inp; 478 int error; 479 480 if (so->so_pcb) 481 return EISCONN; 482 if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0 || 483 sbcheckreserve(so->so_snd.sb_wat, tcp_sendspace) || 484 sbcheckreserve(so->so_rcv.sb_wat, tcp_recvspace)) { 485 error = soreserve(so, tcp_sendspace, tcp_recvspace); 486 if (error) 487 return (error); 488 } 489 490#ifdef INET6 491 if (so->so_proto->pr_domain->dom_family == PF_INET6) 492 table = &tcb6table; 493 else 494#endif 495 table = &tcbtable; 496 error = in_pcballoc(so, table, wait); 497 if (error) 498 return (error); 499 inp = sotoinpcb(so); 500 tp = tcp_newtcpcb(inp, wait); 501 if (tp == NULL) { 502 unsigned int nofd = so->so_state & SS_NOFDREF; /* XXX */ 503 504 so->so_state &= ~SS_NOFDREF; /* don't free the socket yet */ 505 in_pcbdetach(inp); 506 so->so_state |= nofd; 507 return (ENOBUFS); 508 } 509 tp->t_state = TCPS_CLOSED; 510#ifdef INET6 511 if (ISSET(inp->inp_flags, INP_IPV6)) 512 tp->pf = PF_INET6; 513 else 514#endif 515 tp->pf = PF_INET; 516 if ((so->so_options & SO_LINGER) && so->so_linger == 0) 517 so->so_linger = TCP_LINGERTIME; 518 519 if (so->so_options & SO_DEBUG) 520 tcp_trace(TA_USER, TCPS_CLOSED, tp, tp, NULL, PRU_ATTACH, 0); 521 return (0); 522} 523 524int 525tcp_detach(struct socket *so) 526{ 527 struct inpcb *inp; 528 struct tcpcb *otp = NULL, *tp; 529 int error; 530 short ostate; 531 532 soassertlocked(so); 533 534 if ((error = tcp_sogetpcb(so, &inp, &tp))) 535 return (error); 536 537 if (so->so_options & SO_DEBUG) { 538 otp = tp; 539 ostate = tp->t_state; 540 } 541 542 /* 543 * Detach the TCP protocol from the socket. 544 * If the protocol state is non-embryonic, then can't 545 * do this directly: have to initiate a PRU_DISCONNECT, 546 * which may finish later; embryonic TCB's can just 547 * be discarded here. 
548 */ 549 tp = tcp_dodisconnect(tp); 550 551 if (otp) 552 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DETACH, 0); 553 return (0); 554} 555 556/* 557 * Give the socket an address. 558 */ 559int 560tcp_bind(struct socket *so, struct mbuf *nam, struct proc *p) 561{ 562 struct inpcb *inp; 563 struct tcpcb *tp; 564 int error; 565 short ostate; 566 567 soassertlocked(so); 568 569 if ((error = tcp_sogetpcb(so, &inp, &tp))) 570 return (error); 571 572 if (so->so_options & SO_DEBUG) 573 ostate = tp->t_state; 574 575 error = in_pcbbind(inp, nam, p); 576 577 if (so->so_options & SO_DEBUG) 578 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_BIND, 0); 579 return (error); 580} 581 582/* 583 * Prepare to accept connections. 584 */ 585int 586tcp_listen(struct socket *so) 587{ 588 struct inpcb *inp; 589 struct tcpcb *tp, *otp = NULL; 590 int error; 591 short ostate; 592 593 soassertlocked(so); 594 595 if ((error = tcp_sogetpcb(so, &inp, &tp))) 596 return (error); 597 598 if (so->so_options & SO_DEBUG) { 599 otp = tp; 600 ostate = tp->t_state; 601 } 602 603 if (inp->inp_lport == 0) 604 if ((error = in_pcbbind(inp, NULL, curproc))) 605 goto out; 606 607 /* 608 * If the in_pcbbind() above is called, the tp->pf 609 * should still be whatever it was before. 610 */ 611 tp->t_state = TCPS_LISTEN; 612 613out: 614 if (otp) 615 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_LISTEN, 0); 616 return (error); 617} 618 619/* 620 * Initiate connection to peer. 621 * Create a template for use in transmissions on this connection. 622 * Enter SYN_SENT state, and mark socket as connecting. 623 * Start keep-alive timer, and seed output sequence space. 624 * Send initial segment on connection. 
625 */ 626int 627tcp_connect(struct socket *so, struct mbuf *nam) 628{ 629 struct inpcb *inp; 630 struct tcpcb *tp, *otp = NULL; 631 int error; 632 short ostate; 633 634 soassertlocked(so); 635 636 if ((error = tcp_sogetpcb(so, &inp, &tp))) 637 return (error); 638 639 if (so->so_options & SO_DEBUG) { 640 otp = tp; 641 ostate = tp->t_state; 642 } 643 644#ifdef INET6 645 if (ISSET(inp->inp_flags, INP_IPV6)) { 646 struct sockaddr_in6 *sin6; 647 648 if ((error = in6_nam2sin6(nam, &sin6))) 649 goto out; 650 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || 651 IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { 652 error = EINVAL; 653 goto out; 654 } 655 } else 656#endif 657 { 658 struct sockaddr_in *sin; 659 660 if ((error = in_nam2sin(nam, &sin))) 661 goto out; 662 if ((sin->sin_addr.s_addr == INADDR_ANY) || 663 (sin->sin_addr.s_addr == INADDR_BROADCAST) || 664 IN_MULTICAST(sin->sin_addr.s_addr) || 665 in_broadcast(sin->sin_addr, inp->inp_rtableid)) { 666 error = EINVAL; 667 goto out; 668 } 669 } 670 error = in_pcbconnect(inp, nam); 671 if (error) 672 goto out; 673 674 tp->t_template = tcp_template(tp); 675 if (tp->t_template == 0) { 676 in_pcbunset_faddr(inp); 677 in_pcbdisconnect(inp); 678 error = ENOBUFS; 679 goto out; 680 } 681 682 so->so_state |= SS_CONNECTOUT; 683 684 /* Compute window scaling to request. */ 685 tcp_rscale(tp, sb_max); 686 687 soisconnecting(so); 688 tcpstat_inc(tcps_connattempt); 689 tp->t_state = TCPS_SYN_SENT; 690 TCP_TIMER_ARM(tp, TCPT_KEEP, atomic_load_int(&tcp_keepinit)); 691 tcp_set_iss_tsm(tp); 692 tcp_sendseqinit(tp); 693 tp->snd_last = tp->snd_una; 694 error = tcp_output(tp); 695 696out: 697 if (otp) 698 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_CONNECT, 0); 699 return (error); 700} 701 702/* 703 * Accept a connection. Essentially all the work is done at higher 704 * levels; just return the address of the peer, storing through addr. 
705 */ 706int 707tcp_accept(struct socket *so, struct mbuf *nam) 708{ 709 struct inpcb *inp; 710 struct tcpcb *tp; 711 int error; 712 713 soassertlocked(so); 714 715 if ((error = tcp_sogetpcb(so, &inp, &tp))) 716 return (error); 717 718 in_setpeeraddr(inp, nam); 719 720 if (so->so_options & SO_DEBUG) 721 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_ACCEPT, 0); 722 return (0); 723} 724 725/* 726 * Initiate disconnect from peer. 727 * If connection never passed embryonic stage, just drop; 728 * else if don't need to let data drain, then can just drop anyways, 729 * else have to begin TCP shutdown process: mark socket disconnecting, 730 * drain unread data, state switch to reflect user close, and 731 * send segment (e.g. FIN) to peer. Socket will be really disconnected 732 * when peer sends FIN and acks ours. 733 * 734 * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB. 735 */ 736int 737tcp_disconnect(struct socket *so) 738{ 739 struct inpcb *inp; 740 struct tcpcb *tp, *otp = NULL; 741 int error; 742 short ostate; 743 744 soassertlocked(so); 745 746 if ((error = tcp_sogetpcb(so, &inp, &tp))) 747 return (error); 748 749 if (so->so_options & SO_DEBUG) { 750 otp = tp; 751 ostate = tp->t_state; 752 } 753 754 tp = tcp_dodisconnect(tp); 755 756 if (otp) 757 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_DISCONNECT, 0); 758 return (0); 759} 760 761/* 762 * Mark the connection as being incapable of further output. 
763 */ 764int 765tcp_shutdown(struct socket *so) 766{ 767 struct inpcb *inp; 768 struct tcpcb *tp, *otp = NULL; 769 int error; 770 short ostate; 771 772 soassertlocked(so); 773 774 if ((error = tcp_sogetpcb(so, &inp, &tp))) 775 return (error); 776 777 if (so->so_options & SO_DEBUG) { 778 otp = tp; 779 ostate = tp->t_state; 780 } 781 782 if (so->so_snd.sb_state & SS_CANTSENDMORE) 783 goto out; 784 785 socantsendmore(so); 786 tp = tcp_usrclosed(tp); 787 if (tp) 788 error = tcp_output(tp); 789 790out: 791 if (otp) 792 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_SHUTDOWN, 0); 793 return (error); 794} 795 796/* 797 * After a receive, possibly send window update to peer. 798 */ 799void 800tcp_rcvd(struct socket *so) 801{ 802 struct inpcb *inp; 803 struct tcpcb *tp; 804 short ostate; 805 806 soassertlocked(so); 807 808 if (tcp_sogetpcb(so, &inp, &tp)) 809 return; 810 811 if (so->so_options & SO_DEBUG) 812 ostate = tp->t_state; 813 814 /* 815 * soreceive() calls this function when a user receives 816 * ancillary data on a listening socket. We don't call 817 * tcp_output in such a case, since there is no header 818 * template for a listening socket and hence the kernel 819 * will panic. 820 */ 821 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) != 0) 822 (void) tcp_output(tp); 823 824 if (so->so_options & SO_DEBUG) 825 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_RCVD, 0); 826} 827 828/* 829 * Do a send by putting data in output queue and updating urgent 830 * marker if URG set. Possibly send more data. 
831 */ 832int 833tcp_send(struct socket *so, struct mbuf *m, struct mbuf *nam, 834 struct mbuf *control) 835{ 836 struct inpcb *inp; 837 struct tcpcb *tp; 838 int error; 839 short ostate; 840 841 soassertlocked(so); 842 843 if (control && control->m_len) { 844 error = EINVAL; 845 goto out; 846 } 847 848 if ((error = tcp_sogetpcb(so, &inp, &tp))) 849 goto out; 850 851 if (so->so_options & SO_DEBUG) 852 ostate = tp->t_state; 853 854 mtx_enter(&so->so_snd.sb_mtx); 855 sbappendstream(&so->so_snd, m); 856 mtx_leave(&so->so_snd.sb_mtx); 857 m = NULL; 858 859 error = tcp_output(tp); 860 861 if (so->so_options & SO_DEBUG) 862 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SEND, 0); 863 864out: 865 m_freem(control); 866 m_freem(m); 867 868 return (error); 869} 870 871/* 872 * Abort the TCP. 873 */ 874void 875tcp_abort(struct socket *so) 876{ 877 struct inpcb *inp; 878 struct tcpcb *tp, *otp = NULL; 879 short ostate; 880 881 soassertlocked(so); 882 883 if (tcp_sogetpcb(so, &inp, &tp)) 884 return; 885 886 if (so->so_options & SO_DEBUG) { 887 otp = tp; 888 ostate = tp->t_state; 889 } 890 891 tp = tcp_drop(tp, ECONNABORTED); 892 893 if (otp) 894 tcp_trace(TA_USER, ostate, tp, otp, NULL, PRU_ABORT, 0); 895} 896 897int 898tcp_sense(struct socket *so, struct stat *ub) 899{ 900 struct inpcb *inp; 901 struct tcpcb *tp; 902 int error; 903 904 soassertlocked(so); 905 906 if ((error = tcp_sogetpcb(so, &inp, &tp))) 907 return (error); 908 909 mtx_enter(&so->so_snd.sb_mtx); 910 ub->st_blksize = so->so_snd.sb_hiwat; 911 mtx_leave(&so->so_snd.sb_mtx); 912 913 if (so->so_options & SO_DEBUG) 914 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_SENSE, 0); 915 return (0); 916} 917 918int 919tcp_rcvoob(struct socket *so, struct mbuf *m, int flags) 920{ 921 struct inpcb *inp; 922 struct tcpcb *tp; 923 int error; 924 925 soassertlocked(so); 926 927 if ((error = tcp_sogetpcb(so, &inp, &tp))) 928 return (error); 929 930 if ((so->so_oobmark == 0 && 931 (so->so_rcv.sb_state & SS_RCVATMARK) == 0) || 
932 so->so_options & SO_OOBINLINE || 933 tp->t_oobflags & TCPOOB_HADDATA) { 934 error = EINVAL; 935 goto out; 936 } 937 if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) { 938 error = EWOULDBLOCK; 939 goto out; 940 } 941 m->m_len = 1; 942 *mtod(m, caddr_t) = tp->t_iobc; 943 if ((flags & MSG_PEEK) == 0) 944 tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA); 945out: 946 if (so->so_options & SO_DEBUG) 947 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_RCVOOB, 0); 948 return (error); 949} 950 951int 952tcp_sendoob(struct socket *so, struct mbuf *m, struct mbuf *nam, 953 struct mbuf *control) 954{ 955 struct inpcb *inp; 956 struct tcpcb *tp; 957 int error; 958 short ostate; 959 960 soassertlocked(so); 961 962 if (control && control->m_len) { 963 error = EINVAL; 964 goto release; 965 } 966 967 if ((error = tcp_sogetpcb(so, &inp, &tp))) 968 goto release; 969 970 if (so->so_options & SO_DEBUG) 971 ostate = tp->t_state; 972 973 if (sbspace(&so->so_snd) < -512) { 974 error = ENOBUFS; 975 goto out; 976 } 977 978 /* 979 * According to RFC961 (Assigned Protocols), 980 * the urgent pointer points to the last octet 981 * of urgent data. We continue, however, 982 * to consider it to indicate the first octet 983 * of data past the urgent section. 984 * Otherwise, snd_up should be one lower. 
985 */ 986 mtx_enter(&so->so_snd.sb_mtx); 987 sbappendstream(&so->so_snd, m); 988 mtx_leave(&so->so_snd.sb_mtx); 989 m = NULL; 990 tp->snd_up = tp->snd_una + so->so_snd.sb_cc; 991 tp->t_force = 1; 992 error = tcp_output(tp); 993 tp->t_force = 0; 994 995out: 996 if (so->so_options & SO_DEBUG) 997 tcp_trace(TA_USER, ostate, tp, tp, NULL, PRU_SENDOOB, 0); 998 999release: 1000 m_freem(control); 1001 m_freem(m); 1002 1003 return (error); 1004} 1005 1006int 1007tcp_sockaddr(struct socket *so, struct mbuf *nam) 1008{ 1009 struct inpcb *inp; 1010 struct tcpcb *tp; 1011 int error; 1012 1013 soassertlocked(so); 1014 1015 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1016 return (error); 1017 1018 in_setsockaddr(inp, nam); 1019 1020 if (so->so_options & SO_DEBUG) 1021 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, 1022 PRU_SOCKADDR, 0); 1023 return (0); 1024} 1025 1026int 1027tcp_peeraddr(struct socket *so, struct mbuf *nam) 1028{ 1029 struct inpcb *inp; 1030 struct tcpcb *tp; 1031 int error; 1032 1033 soassertlocked(so); 1034 1035 if ((error = tcp_sogetpcb(so, &inp, &tp))) 1036 return (error); 1037 1038 in_setpeeraddr(inp, nam); 1039 1040 if (so->so_options & SO_DEBUG) 1041 tcp_trace(TA_USER, tp->t_state, tp, tp, NULL, PRU_PEERADDR, 0); 1042 return (0); 1043} 1044 1045/* 1046 * Initiate (or continue) disconnect. 1047 * If embryonic state, just send reset (once). 1048 * If in ``let data drain'' option and linger null, just drop. 1049 * Otherwise (hard), mark socket disconnecting and drop 1050 * current input data; switch states based on user close, and 1051 * send segment to peer (with FIN). 
 */
struct tcpcb *
tcp_dodisconnect(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;

	if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
		tp = tcp_close(tp);	/* never established, just discard */
	else if ((so->so_options & SO_LINGER) && so->so_linger == 0)
		tp = tcp_drop(tp, 0);	/* zero linger time, drop right away */
	else {
		/*
		 * Graceful shutdown: flush unread receive data and walk
		 * through the shutdown states; if the tcpcb survives,
		 * try to send out our FIN immediately.
		 */
		soisdisconnecting(so);
		mtx_enter(&so->so_rcv.sb_mtx);
		sbflush(&so->so_rcv);
		mtx_leave(&so->so_rcv.sb_mtx);
		tp = tcp_usrclosed(tp);
		if (tp)
			(void) tcp_output(tp);
	}
	return (tp);	/* NULL when the tcpcb has been deallocated */
}

/*
 * User issued close, and wish to trail through shutdown states:
 * if never received SYN, just forget it. If got a SYN from peer,
 * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
 * If already got a FIN from peer, then almost done; go to LAST_ACK
 * state. In all other cases, have already sent FIN to peer (e.g.
 * after PRU_SHUTDOWN), and just have to play tedious game waiting
 * for peer to send FIN or not respond to keep-alives, etc.
 * We can let the user exit from the close as soon as the FIN is acked.
 */
struct tcpcb *
tcp_usrclosed(struct tcpcb *tp)
{

	switch (tp->t_state) {

	case TCPS_CLOSED:
	case TCPS_LISTEN:
	case TCPS_SYN_SENT:
		tp->t_state = TCPS_CLOSED;
		tp = tcp_close(tp);	/* may return NULL */
		break;

	case TCPS_SYN_RECEIVED:
	case TCPS_ESTABLISHED:
		tp->t_state = TCPS_FIN_WAIT_1;
		break;

	case TCPS_CLOSE_WAIT:
		tp->t_state = TCPS_LAST_ACK;
		break;
	}
	if (tp && tp->t_state >= TCPS_FIN_WAIT_2) {
		soisdisconnected(tp->t_inpcb->inp_socket);
		/*
		 * If we are in FIN_WAIT_2, we arrived here because the
		 * application did a shutdown of the send side. Like the
		 * case of a transition from FIN_WAIT_1 to FIN_WAIT_2 after
		 * a full close, we start a timer to make sure sockets are
		 * not left in FIN_WAIT_2 forever.
		 */
		if (tp->t_state == TCPS_FIN_WAIT_2) {
			int maxidle;

			maxidle = TCPTV_KEEPCNT *
			    atomic_load_int(&tcp_keepidle);
			TCP_TIMER_ARM(tp, TCPT_2MSL, maxidle);
		}
	}
	return (tp);
}

/*
 * Look up a socket for ident or tcpdrop, ...
 * With dodrop set, abort the matching connection (TCPCTL_DROP, a
 * write-only sysctl); otherwise copy out the owning uids of the
 * connection (TCPCTL_IDENT, a read-only sysctl).
 */
int
tcp_ident(void *oldp, size_t *oldlenp, void *newp, size_t newlen, int dodrop)
{
	int error = 0;
	struct tcp_ident_mapping tir;
	struct inpcb *inp;
	struct socket *so = NULL;
	struct sockaddr_in *fin, *lin;
#ifdef INET6
	struct sockaddr_in6 *fin6, *lin6;
	struct in6_addr f6, l6;
#endif

	if (dodrop) {
		/* drop is write-only: reject any read request */
		if (oldp != NULL || *oldlenp != 0)
			return (EINVAL);
		if (newp == NULL)
			return (EPERM);
		if (newlen < sizeof(tir))
			return (ENOMEM);
		if ((error = copyin(newp, &tir, sizeof (tir))) != 0 )
			return (error);
	} else {
		/* ident is read-only: reject any write request */
		if (oldp == NULL)
			return (EINVAL);
		if (*oldlenp < sizeof(tir))
			return (ENOMEM);
		if (newp != NULL || newlen != 0)
			return (EINVAL);
		if ((error = copyin(oldp, &tir, sizeof (tir))) != 0 )
			return (error);
	}

	NET_LOCK_SHARED();

	/* Validate the address pair; foreign and local family must match. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		if (tir.laddr.ss_family != AF_INET6) {
			NET_UNLOCK_SHARED();
			return (EAFNOSUPPORT);
		}
		fin6 = (struct sockaddr_in6 *)&tir.faddr;
		error = in6_embedscope(&f6, fin6, NULL, NULL);
		if (error) {
			NET_UNLOCK_SHARED();
			return EINVAL;	/*?*/
		}
		lin6 = (struct sockaddr_in6 *)&tir.laddr;
		error = in6_embedscope(&l6, lin6, NULL, NULL);
		if (error) {
			NET_UNLOCK_SHARED();
			return EINVAL;	/*?*/
		}
		break;
#endif
	case AF_INET:
		if (tir.laddr.ss_family != AF_INET) {
			NET_UNLOCK_SHARED();
			return (EAFNOSUPPORT);
		}
		fin = (struct sockaddr_in *)&tir.faddr;
		lin = (struct sockaddr_in *)&tir.laddr;
		break;
	default:
		NET_UNLOCK_SHARED();
		return (EAFNOSUPPORT);
	}

	/* Find the fully specified connection in the PCB table. */
	switch (tir.faddr.ss_family) {
#ifdef INET6
	case AF_INET6:
		inp = in6_pcblookup(&tcb6table, &f6,
		    fin6->sin6_port, &l6, lin6->sin6_port, tir.rdomain);
		break;
#endif
	case AF_INET:
		inp = in_pcblookup(&tcbtable, fin->sin_addr,
		    fin->sin_port, lin->sin_addr, lin->sin_port, tir.rdomain);
		break;
	default:
		unhandled_af(tir.faddr.ss_family);
	}

	if (dodrop) {
		struct tcpcb *tp = NULL;

		if (inp != NULL) {
			so = in_pcbsolock(inp);
			if (so != NULL)
				tp = intotcpcb(inp);
		}
		/* never drop a listening socket, only real connections */
		if (tp != NULL && !ISSET(so->so_options, SO_ACCEPTCONN))
			tp = tcp_drop(tp, ECONNABORTED);
		else
			error = ESRCH;

		in_pcbsounlock(inp, so);
		NET_UNLOCK_SHARED();
		in_pcbunref(inp);
		return (error);
	}

	/* No exact match; fall back to a listening socket on laddr. */
	if (inp == NULL) {
		tcpstat_inc(tcps_pcbhashmiss);
		switch (tir.faddr.ss_family) {
#ifdef INET6
		case AF_INET6:
			inp = in6_pcblookup_listen(&tcb6table,
			    &l6, lin6->sin6_port, NULL, tir.rdomain);
			break;
#endif
		case AF_INET:
			inp = in_pcblookup_listen(&tcbtable,
			    lin->sin_addr, lin->sin_port, NULL, tir.rdomain);
			break;
		}
	}

	if (inp != NULL)
		so = in_pcbsolock(inp);

	/* Report uids only for sockets that initiated the connection. */
	if (so != NULL && ISSET(so->so_state, SS_CONNECTOUT)) {
		tir.ruid = so->so_ruid;
		tir.euid = so->so_euid;
	} else {
		tir.ruid = -1;
		tir.euid = -1;
	}

	in_pcbsounlock(inp, so);
	NET_UNLOCK_SHARED();
	in_pcbunref(inp);

	*oldlenp = sizeof(tir);
	return copyout(&tir, oldp, sizeof(tir));
}

#ifndef SMALL_KERNEL
/*
 * Export TCP statistics for sysctl: fold the per-CPU tcps_* counters
 * into a struct tcpstat and fill in the current syn cache numbers.
 */
int
tcp_sysctl_tcpstat(void *oldp, size_t *oldlenp, void *newp)
{
	uint64_t counters[tcps_ncounters];
	struct tcpstat tcpstat;
	struct syn_cache_set *set;
	int i = 0;

	/*
	 * Counters are consumed sequentially via i++, so the ASSIGN()
	 * list below must stay in the order of the tcps_* counter
	 * enumeration; do not reorder.
	 */
#define ASSIGN(field)	do { tcpstat.field = counters[i++]; } while (0)

	memset(&tcpstat, 0, sizeof tcpstat);
	counters_read(tcpcounters, counters, nitems(counters), NULL);
	ASSIGN(tcps_connattempt);
	ASSIGN(tcps_accepts);
	ASSIGN(tcps_connects);
	ASSIGN(tcps_drops);
	ASSIGN(tcps_conndrops);
	ASSIGN(tcps_closed);
	ASSIGN(tcps_segstimed);
	ASSIGN(tcps_rttupdated);
	ASSIGN(tcps_delack);
	ASSIGN(tcps_timeoutdrop);
	ASSIGN(tcps_rexmttimeo);
	ASSIGN(tcps_persisttimeo);
	ASSIGN(tcps_persistdrop);
	ASSIGN(tcps_keeptimeo);
	ASSIGN(tcps_keepprobe);
	ASSIGN(tcps_keepdrops);
	ASSIGN(tcps_sndtotal);
	ASSIGN(tcps_sndpack);
	ASSIGN(tcps_sndbyte);
	ASSIGN(tcps_sndrexmitpack);
	ASSIGN(tcps_sndrexmitbyte);
	ASSIGN(tcps_sndrexmitfast);
	ASSIGN(tcps_sndacks);
	ASSIGN(tcps_sndprobe);
	ASSIGN(tcps_sndurg);
	ASSIGN(tcps_sndwinup);
	ASSIGN(tcps_sndctrl);
	ASSIGN(tcps_rcvtotal);
	ASSIGN(tcps_rcvpack);
	ASSIGN(tcps_rcvbyte);
	ASSIGN(tcps_rcvbadsum);
	ASSIGN(tcps_rcvbadoff);
	ASSIGN(tcps_rcvmemdrop);
	ASSIGN(tcps_rcvnosec);
	ASSIGN(tcps_rcvshort);
	ASSIGN(tcps_rcvduppack);
	ASSIGN(tcps_rcvdupbyte);
	ASSIGN(tcps_rcvpartduppack);
	ASSIGN(tcps_rcvpartdupbyte);
	ASSIGN(tcps_rcvoopack);
	ASSIGN(tcps_rcvoobyte);
	ASSIGN(tcps_rcvpackafterwin);
	ASSIGN(tcps_rcvbyteafterwin);
	ASSIGN(tcps_rcvafterclose);
	ASSIGN(tcps_rcvwinprobe);
	ASSIGN(tcps_rcvdupack);
	ASSIGN(tcps_rcvacktoomuch);
	ASSIGN(tcps_rcvacktooold);
	ASSIGN(tcps_rcvackpack);
	ASSIGN(tcps_rcvackbyte);
	ASSIGN(tcps_rcvwinupd);
	ASSIGN(tcps_pawsdrop);
	ASSIGN(tcps_predack);
	ASSIGN(tcps_preddat);
	ASSIGN(tcps_pcbhashmiss);
	ASSIGN(tcps_noport);
	ASSIGN(tcps_closing);
	ASSIGN(tcps_badsyn);
	ASSIGN(tcps_dropsyn);
	ASSIGN(tcps_rcvbadsig);
	ASSIGN(tcps_rcvgoodsig);
	ASSIGN(tcps_inswcsum);
	ASSIGN(tcps_outswcsum);
	ASSIGN(tcps_ecn_accepts);
	ASSIGN(tcps_ecn_rcvece);
	ASSIGN(tcps_ecn_rcvcwr);
	ASSIGN(tcps_ecn_rcvce);
	ASSIGN(tcps_ecn_sndect);
	ASSIGN(tcps_ecn_sndece);
	ASSIGN(tcps_ecn_sndcwr);
	ASSIGN(tcps_cwr_ecn);
	ASSIGN(tcps_cwr_frecovery);
	ASSIGN(tcps_cwr_timeout);
	ASSIGN(tcps_sc_added);
	ASSIGN(tcps_sc_completed);
	ASSIGN(tcps_sc_timed_out);
	ASSIGN(tcps_sc_overflowed);
	ASSIGN(tcps_sc_reset);
	ASSIGN(tcps_sc_unreach);
	ASSIGN(tcps_sc_bucketoverflow);
	ASSIGN(tcps_sc_aborted);
	ASSIGN(tcps_sc_dupesyn);
	ASSIGN(tcps_sc_dropped);
	ASSIGN(tcps_sc_collisions);
	ASSIGN(tcps_sc_retransmitted);
	ASSIGN(tcps_sc_seedrandom);
	ASSIGN(tcps_sc_hash_size);
	ASSIGN(tcps_sc_entry_count);
	ASSIGN(tcps_sc_entry_limit);
	ASSIGN(tcps_sc_bucket_maxlen);
	ASSIGN(tcps_sc_bucket_limit);
	ASSIGN(tcps_sc_uses_left);
	ASSIGN(tcps_conndrained);
	ASSIGN(tcps_sack_recovery_episode);
	ASSIGN(tcps_sack_rexmits);
	ASSIGN(tcps_sack_rexmit_bytes);
	ASSIGN(tcps_sack_rcv_opts);
	ASSIGN(tcps_sack_snd_opts);
	ASSIGN(tcps_sack_drop_opts);
	ASSIGN(tcps_outswtso);
	ASSIGN(tcps_outhwtso);
	ASSIGN(tcps_outpkttso);
	ASSIGN(tcps_outbadtso);
	ASSIGN(tcps_inswlro);
	ASSIGN(tcps_inhwlro);
	ASSIGN(tcps_inpktlro);
	ASSIGN(tcps_inbadlro);

#undef ASSIGN

	/*
	 * Overwrite the syn cache gauges with live values from the
	 * active cache set, under the syn cache mutex.
	 */
	mtx_enter(&syn_cache_mtx);
	set = &tcp_syn_cache[tcp_syn_cache_active];
	tcpstat.tcps_sc_hash_size = set->scs_size;
	tcpstat.tcps_sc_entry_count = set->scs_count;
	tcpstat.tcps_sc_entry_limit = atomic_load_int(&tcp_syn_cache_limit);
	tcpstat.tcps_sc_bucket_maxlen = 0;
	for (i = 0; i < set->scs_size; i++) {
		if (tcpstat.tcps_sc_bucket_maxlen <
		    set->scs_buckethead[i].sch_length)
			tcpstat.tcps_sc_bucket_maxlen =
			    set->scs_buckethead[i].sch_length;
	}
	tcpstat.tcps_sc_bucket_limit = atomic_load_int(&tcp_syn_bucket_limit);
	tcpstat.tcps_sc_uses_left = set->scs_use;
	mtx_leave(&syn_cache_mtx);

	return (sysctl_rdstruct(oldp, oldlenp, newp,
	    &tcpstat, sizeof(tcpstat)));
}

/*
 * Sysctl for tcp variables.
 */
int
tcp_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	int error, oval, nval;

	/* All sysctl names at this level are terminal. */
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case TCPCTL_ROOTONLY:
		/* with raised securelevel the rootonly list is immutable */
		if (newp && (int)atomic_load_int(&securelevel) > 0)
			return (EPERM);
		/* FALLTHROUGH */
	case TCPCTL_BADDYNAMIC: {
		struct baddynamicports *ports = (name[0] == TCPCTL_ROOTONLY ?
		    &rootonlyports : &baddynamicports);
		const size_t bufitems = DP_MAPSIZE;
		const size_t buflen = bufitems * sizeof(uint32_t);
		size_t i;
		uint32_t *buf;
		int error;

		/*
		 * Work on a snapshot copy of the port bitmap so the net
		 * lock is not held across the userland copy in/out.
		 */
		buf = malloc(buflen, M_SYSCTL, M_WAITOK | M_ZERO);

		NET_LOCK_SHARED();
		for (i = 0; i < bufitems; ++i)
			buf[i] = ports->tcp[i];
		NET_UNLOCK_SHARED();

		error = sysctl_struct(oldp, oldlenp, newp, newlen,
		    buf, buflen);

		if (error == 0 && newp) {
			/* write back the updated bitmap exclusively */
			NET_LOCK();
			for (i = 0; i < bufitems; ++i)
				ports->tcp[i] = buf[i];
			NET_UNLOCK();
		}

		free(buf, M_SYSCTL, buflen);

		return (error);
	}
	case TCPCTL_IDENT:
		return tcp_ident(oldp, oldlenp, newp, newlen, 0);

	case TCPCTL_DROP:
		return tcp_ident(oldp, oldlenp, newp, newlen, 1);

	case TCPCTL_REASS_LIMIT:
	case TCPCTL_SACKHOLE_LIMIT: {
		struct pool *pool;
		int *var;

		if (name[0] == TCPCTL_REASS_LIMIT) {
			pool = &tcpqe_pool;
			var = &tcp_reass_limit;
		} else {
			pool = &sackhl_pool;
			var = &tcp_sackhole_limit;
		}

		oval = nval = atomic_load_int(var);
		error = sysctl_int(oldp, oldlenp, newp, newlen, &nval);

		if (error == 0 && oval != nval) {
			extern struct rwlock sysctl_lock;

			/*
			 * Serialize concurrent limit updates; re-check the
			 * value under the lock before touching the pool.
			 */
			error = rw_enter(&sysctl_lock, RW_WRITE | RW_INTR);
			if (error)
				return (error);
			if (nval != atomic_load_int(var)) {
				error = pool_sethardlimit(pool, nval);
				if (error == 0)
					atomic_store_int(var, nval);
			}
			rw_exit(&sysctl_lock);
		}

		return (error);
	}
	case TCPCTL_STATS:
		return (tcp_sysctl_tcpstat(oldp, oldlenp, newp));

	case TCPCTL_SYN_USE_LIMIT:
		oval = nval = atomic_load_int(&tcp_syn_use_limit);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 0, INT_MAX);
		if (!error && oval != nval) {
			/*
			 * Global tcp_syn_use_limit is used when reseeding a
			 * new cache. Also update the value in active cache.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_use > nval)
				tcp_syn_cache[0].scs_use = nval;
			if (tcp_syn_cache[1].scs_use > nval)
				tcp_syn_cache[1].scs_use = nval;
			tcp_syn_use_limit = nval;
			mtx_leave(&syn_cache_mtx);
		}
		return (error);

	case TCPCTL_SYN_HASH_SIZE:
		oval = nval = atomic_load_int(&tcp_syn_hash_size);
		error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &nval, 1, 100000);
		if (!error && oval != nval) {
			/*
			 * If global hash size has been changed,
			 * switch sets as soon as possible. Then
			 * the actual hash array will be reallocated.
			 */
			mtx_enter(&syn_cache_mtx);
			if (tcp_syn_cache[0].scs_size != nval)
				tcp_syn_cache[0].scs_use = 0;
			if (tcp_syn_cache[1].scs_size != nval)
				tcp_syn_cache[1].scs_use = 0;
			tcp_syn_hash_size = nval;
			mtx_leave(&syn_cache_mtx);
		}
		return (error);

	default:
		/* plain bounded integers are handled via the table */
		error = sysctl_bounded_arr(tcpctl_vars, nitems(tcpctl_vars),
		    name, namelen, oldp, oldlenp, newp, newlen);
		/* keep the tick-based mirrors of the *_sec values in sync */
		switch (name[0]) {
		case TCPCTL_KEEPINITTIME:
			atomic_store_int(&tcp_keepinit,
			    atomic_load_int(&tcp_keepinit_sec) * TCP_TIME(1));
			break;
		case TCPCTL_KEEPIDLE:
			atomic_store_int(&tcp_keepidle,
			    atomic_load_int(&tcp_keepidle_sec) * TCP_TIME(1));
			break;
		case TCPCTL_KEEPINTVL:
			atomic_store_int(&tcp_keepintvl,
			    atomic_load_int(&tcp_keepintvl_sec) * TCP_TIME(1));
			break;
		}
		return (error);
	}
	/* NOTREACHED */
}
#endif /* SMALL_KERNEL */

/*
 * Scale the send buffer so that inflight data is not accounted against
 * the limit. The buffer will scale with the congestion window, if
 * the receiver stops acking data the window will shrink and therefore
 * the buffer size will shrink as well.
 * In low memory situation try to shrink the buffer to the initial size
 * disabling the send buffer scaling as long as the situation persists.
 */
void
tcp_update_sndspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax;

	mtx_enter(&so->so_snd.sb_mtx);

	nmax = so->so_snd.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_sendspace < nmax)
			nmax = tcp_sendspace;
	} else if (so->so_snd.sb_wat != tcp_sendspace) {
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_snd.sb_wat;
	} else {
		/* automatic buffer scaling: add the in-flight bytes */
		nmax = MIN(sb_max, so->so_snd.sb_wat + tp->snd_max -
		    tp->snd_una);
	}

	/* a writable socket must be preserved because of poll(2) semantics */
	if (sbspace_locked(&so->so_snd) >= so->so_snd.sb_lowat) {
		if (nmax < so->so_snd.sb_cc + so->so_snd.sb_lowat)
			nmax = so->so_snd.sb_cc + so->so_snd.sb_lowat;
		/* keep in sync with sbreserve() calculation */
		if (nmax * 8 < so->so_snd.sb_mbcnt + so->so_snd.sb_lowat)
			nmax = (so->so_snd.sb_mbcnt+so->so_snd.sb_lowat+7) / 8;
	}

	/* round to MSS boundary */
	nmax = roundup(nmax, tp->t_maxseg);

	/* only touch the sockbuf when the limit actually changes */
	if (nmax != so->so_snd.sb_hiwat)
		sbreserve(&so->so_snd, nmax);

	mtx_leave(&so->so_snd.sb_mtx);
}

/*
 * Scale the recv buffer by looking at how much data was transferred in
 * one approximated RTT. If more than a big part of the recv buffer was
 * transferred during that time we increase the buffer by a constant.
 * In low memory situation try to shrink the buffer to the initial size.
 */
void
tcp_update_rcvspace(struct tcpcb *tp)
{
	struct socket *so = tp->t_inpcb->inp_socket;
	u_long nmax;

	mtx_enter(&so->so_rcv.sb_mtx);

	nmax = so->so_rcv.sb_hiwat;

	if (sbchecklowmem()) {
		/* low on memory try to get rid of some */
		if (tcp_recvspace < nmax)
			nmax = tcp_recvspace;
	} else if (so->so_rcv.sb_wat != tcp_recvspace) {
		/* user requested buffer size, auto-scaling disabled */
		nmax = so->so_rcv.sb_wat;
	} else {
		/*
		 * automatic buffer scaling: grow by a constant once more
		 * than 7/8 of the buffer was used within the last RTT
		 */
		if (tp->rfbuf_cnt > so->so_rcv.sb_hiwat / 8 * 7)
			nmax = MIN(sb_max, so->so_rcv.sb_hiwat +
			    tcp_autorcvbuf_inc);
	}

	/* a readable socket must be preserved because of poll(2) semantics */
	mtx_enter(&so->so_snd.sb_mtx);
	if (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat &&
	    nmax < so->so_snd.sb_lowat)
		nmax = so->so_snd.sb_lowat;
	mtx_leave(&so->so_snd.sb_mtx);

	/* only touch the sockbuf when the limit actually changes */
	if (nmax != so->so_rcv.sb_hiwat) {
		/* round to MSS boundary */
		nmax = roundup(nmax, tp->t_maxseg);
		sbreserve(&so->so_rcv, nmax);
	}

	mtx_leave(&so->so_rcv.sb_mtx);
}