/* jcs's local copy of OpenBSD sys/net/bpf.c (hacks tree) */
1/* $OpenBSD: bpf.c,v 1.235 2025/11/13 10:53:25 deraadt Exp $ */
2/* $NetBSD: bpf.c,v 1.33 1997/02/21 23:59:35 thorpej Exp $ */
3
4/*
5 * Copyright (c) 1990, 1991, 1993
6 * The Regents of the University of California. All rights reserved.
7 * Copyright (c) 2010, 2014 Henning Brauer <henning@openbsd.org>
8 *
9 * This code is derived from the Stanford/CMU enet packet filter,
10 * (net/enet.c) distributed as part of 4.3BSD, and code contributed
11 * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
12 * Berkeley Laboratory.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 * 3. Neither the name of the University nor the names of its contributors
23 * may be used to endorse or promote products derived from this software
24 * without specific prior written permission.
25 *
26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36 * SUCH DAMAGE.
37 *
38 * @(#)bpf.c 8.2 (Berkeley) 3/28/94
39 */
40
41#include <sys/param.h>
42#include <sys/systm.h>
43#include <sys/mbuf.h>
44#include <sys/timeout.h>
45#include <sys/signalvar.h>
46#include <sys/ioctl.h>
47#include <sys/conf.h>
48#include <sys/vnode.h>
49#include <sys/socket.h>
50#include <sys/sysctl.h>
51#include <sys/atomic.h>
52#include <sys/event.h>
53#include <sys/mutex.h>
54#include <sys/refcnt.h>
55#include <sys/smr.h>
56#include <sys/specdev.h>
57#include <sys/sigio.h>
58#include <sys/task.h>
59#include <sys/time.h>
60
61#include <net/if.h>
62#include <net/bpf.h>
63#include <net/bpfdesc.h>
64
65#include <netinet/in.h>
66#include <netinet/if_ether.h>
67
68#include "vlan.h"
69
#define BPF_BUFSIZE 32768

/*
 * Read-side store buffer states (bd_state); transitions happen in
 * bpf_wait_cb() (WAIT -> DONE) and ROTATE_BUFFERS() (-> IDLE).
 */
#define BPF_S_IDLE 0	/* nothing pending */
#define BPF_S_WAIT 1	/* wait timeout is running */
#define BPF_S_DONE 2	/* store buffer may be handed to readers */

#define PRINET 26 /* interruptible */

/*
 * Locks used to protect data:
 *	a	atomic
 */

/*
 * The default read buffer size is patchable.
 */
int bpf_bufsize = BPF_BUFSIZE;		/* [a] */
int bpf_maxbufsize = BPF_MAXBUFSIZE;	/* [a] */

/*
 * bpf_iflist is the list of interfaces; each corresponds to an ifnet
 * bpf_d_list is the list of descriptors
 */
TAILQ_HEAD(, bpf_if) bpf_iflist = TAILQ_HEAD_INITIALIZER(bpf_iflist);
LIST_HEAD(, bpf_d) bpf_d_list = LIST_HEAD_INITIALIZER(bpf_d_list);

/* File-local helper prototypes. */
int	bpf_allocbufs(struct bpf_d *);
void	bpf_ifname(struct bpf_if*, struct ifreq *);
void	bpf_mcopy(const void *, void *, size_t);
int	bpf_movein(struct uio *, struct bpf_d *, struct mbuf **,
	    struct sockaddr *);
int	bpf_setif(struct bpf_d *, struct ifreq *);
int	bpfkqfilter(dev_t, struct knote *);
void	bpf_wakeup(struct bpf_d *);
void	bpf_wakeup_cb(void *);
void	bpf_wait_cb(void *);
int	_bpf_mtap(caddr_t, const struct mbuf *, const struct mbuf *, u_int);
void	bpf_catchpacket(struct bpf_d *, u_char *, size_t, size_t,
	    const struct bpf_hdr *);
int	bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
int	bpf_setdlt(struct bpf_d *, u_int);

/* kqueue(2) EVFILT_READ glue; see bpfread_filtops below. */
void	filt_bpfrdetach(struct knote *);
int	filt_bpfread(struct knote *, long);
int	filt_bpfreadmodify(struct kevent *, struct knote *);
int	filt_bpfreadprocess(struct knote *, struct kevent *);

struct bpf_d *bpfilter_lookup(int);

/*
 * Called holding ``bd_mtx''.
 */
void	bpf_attachd(struct bpf_d *, struct bpf_if *);
void	bpf_detachd(struct bpf_d *);
void	bpf_resetd(struct bpf_d *);

/* SMR destructors for filter programs and descriptors. */
void	bpf_prog_smr(void *);
void	bpf_d_smr(void *);

/*
 * Reference count access to descriptor buffers
 */
void	bpf_get(struct bpf_d *);
void	bpf_put(struct bpf_d *);
134
/*
 * Copy a packet written by userland (bpfwrite()) into a fresh mbuf and
 * build the sockaddr that if_output() expects, based on the link type.
 * The packet must pass the descriptor's write filter in full (EPERM
 * otherwise).  On success *mp owns the mbuf; on failure it is freed
 * here.  Returns 0 or an errno.
 */
int
bpf_movein(struct uio *uio, struct bpf_d *d, struct mbuf **mp,
    struct sockaddr *sockp)
{
	struct bpf_program_smr *bps;
	struct bpf_insn *fcode = NULL;
	struct mbuf *m;
	struct m_tag *mtag;
	int error;
	u_int hlen, alen, mlen;
	u_int len;
	u_int linktype;
	u_int slen;

	/*
	 * Build a sockaddr based on the data link layer type.
	 * We do this at this level because the ethernet header
	 * is copied directly into the data field of the sockaddr.
	 * In the case of SLIP, there is no header and the packet
	 * is forwarded as is.
	 * Also, we are careful to leave room at the front of the mbuf
	 * for the link level header.
	 */
	linktype = d->bd_bif->bif_dlt;
	switch (linktype) {

	case DLT_SLIP:
		sockp->sa_family = AF_INET;
		hlen = 0;
		break;

	case DLT_PPP:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_EN10MB:
		sockp->sa_family = AF_UNSPEC;
		/* XXX Would MAXLINKHDR be better? */
		hlen = ETHER_HDR_LEN;
		break;

	case DLT_IEEE802_11:
	case DLT_IEEE802_11_RADIO:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_RAW:
	case DLT_NULL:
		sockp->sa_family = AF_UNSPEC;
		hlen = 0;
		break;

	case DLT_LOOP:
		sockp->sa_family = AF_UNSPEC;
		hlen = sizeof(u_int32_t);
		break;

	default:
		/* Unsupported link type for writes. */
		return (EIO);
	}

	if (uio->uio_resid > MAXMCLBYTES)
		return (EMSGSIZE);
	len = uio->uio_resid;
	/* The write must at least contain the link header. */
	if (len < hlen)
		return (EINVAL);

	/*
	 * Get the length of the payload so we can align it properly.
	 */
	alen = len - hlen;

	/*
	 * Allocate enough space for headers and the aligned payload.
	 */
	mlen = max(max_linkhdr, hlen) + roundup(alen, sizeof(long));
	if (mlen > MAXMCLBYTES)
		return (EMSGSIZE);

	MGETHDR(m, M_WAIT, MT_DATA);
	if (mlen > MHLEN) {
		/* Payload does not fit in the header mbuf: add a cluster. */
		MCLGETL(m, M_WAIT, mlen);
		if ((m->m_flags & M_EXT) == 0) {
			error = ENOBUFS;
			goto bad;
		}
	}

	m_align(m, alen);	/* Align the payload. */
	m->m_data -= hlen;

	m->m_pkthdr.ph_ifidx = 0;
	m->m_pkthdr.len = len;
	m->m_len = len;

	error = uiomove(mtod(m, caddr_t), len, uio);
	if (error)
		goto bad;

	/* Run the packet through the write filter under an SMR section. */
	smr_read_enter();
	bps = SMR_PTR_GET(&d->bd_wfilter);
	if (bps != NULL)
		fcode = bps->bps_bf.bf_insns;
	slen = bpf_filter(fcode, mtod(m, u_char *), len, len);
	smr_read_leave();

	/* The write filter must accept the entire packet. */
	if (slen < len) {
		error = EPERM;
		goto bad;
	}

	/*
	 * Make room for link header, and copy it to sockaddr
	 */
	if (hlen != 0) {
		if (linktype == DLT_LOOP) {
			u_int32_t af;

			/* the link header indicates the address family */
			KASSERT(hlen == sizeof(u_int32_t));
			memcpy(&af, m->m_data, hlen);
			sockp->sa_family = ntohl(af);
		} else
			memcpy(sockp->sa_data, m->m_data, hlen);

		/* Strip the link header from the mbuf payload. */
		m->m_pkthdr.len -= hlen;
		m->m_len -= hlen;
		m->m_data += hlen;
	}

	/*
	 * Prepend the data link type as a mbuf tag
	 */
	mtag = m_tag_get(PACKET_TAG_DLT, sizeof(u_int), M_WAIT);
	*(u_int *)(mtag + 1) = linktype;
	m_tag_prepend(m, mtag);

	*mp = m;
	return (0);
 bad:
	m_freem(m);
	return (error);
}
280
/*
 * Attach file to the bpf interface, i.e. make d listen on bp.
 * Called holding ``bd_mtx''; list insertion additionally relies on
 * the kernel lock as the SMR writer-side serialization.
 */
void
bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	/*
	 * Point d at bp, and add d to the interface's list of listeners.
	 * Finally, point the driver's bpf cookie at the interface so
	 * it will divert packets to bpf.
	 */

	d->bd_bif = bp;

	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_INSERT_HEAD_LOCKED(&bp->bif_dlist, d, bd_next);

	/* A non-NULL cookie makes the driver hand packets to bpf. */
	*bp->bif_driverp = bp;
}
302
/*
 * Detach a file from its interface.  Called holding ``bd_mtx''; the
 * mutex is temporarily released (with a reference held) to turn
 * promiscuous mode back off.
 */
void
bpf_detachd(struct bpf_d *d)
{
	struct bpf_if *bp;

	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	bp = d->bd_bif;
	/* Not attached. */
	if (bp == NULL)
		return;

	/* Remove ``d'' from the interface's descriptor list. */
	KERNEL_ASSERT_LOCKED();
	SMR_SLIST_REMOVE_LOCKED(&bp->bif_dlist, d, bpf_d, bd_next);

	if (SMR_SLIST_EMPTY_LOCKED(&bp->bif_dlist)) {
		/*
		 * Let the driver know that there are no more listeners.
		 */
		*bp->bif_driverp = NULL;
	}

	d->bd_bif = NULL;

	/*
	 * Check if this descriptor had requested promiscuous mode.
	 * If so, turn it off.
	 */
	if (d->bd_promisc) {
		int error;

		KASSERT(bp->bif_ifp != NULL);

		/* Clear the flag before sleeping in ifpromisc(). */
		d->bd_promisc = 0;

		/* Keep ``d'' alive across the mutex/NET_LOCK dance. */
		bpf_get(d);
		mtx_leave(&d->bd_mtx);
		NET_LOCK();
		error = ifpromisc(bp->bif_ifp, 0);
		NET_UNLOCK();
		mtx_enter(&d->bd_mtx);
		bpf_put(d);

		if (error && !(error == EINVAL || error == ENODEV ||
		    error == ENXIO))
			/*
			 * Something is really wrong if we were able to put
			 * the driver into promiscuous mode, but can't
			 * take it out.
			 */
			panic("bpf: ifpromisc failed");
	}
}
360
/*
 * Pseudo-device attach routine.  Descriptors are created lazily in
 * bpfopen(), so there is nothing to set up here.
 */
void
bpfilterattach(int n)
{
}
365
/*
 * Open bpf device.  Returns ENXIO for illegal minor device number,
 * EBUSY if the descriptor could not be allocated.
 */
int
bpfopen(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *bd;
	int unit = minor(dev);

	/* Only cloning minors (multiples of 1 << CLONE_SHIFT) are valid. */
	if (unit & ((1 << CLONE_SHIFT) - 1))
		return (ENXIO);

	KASSERT(bpfilter_lookup(unit) == NULL);

	/* create on demand */
	if ((bd = malloc(sizeof(*bd), M_DEVBUF, M_NOWAIT|M_ZERO)) == NULL)
		return (EBUSY);

	/* Mark "free" and do most initialization. */
	bd->bd_unit = unit;
	bd->bd_bufsize = atomic_load_int(&bpf_bufsize);
	bd->bd_sig = SIGIO;
	mtx_init(&bd->bd_mtx, IPL_NET);
	task_set(&bd->bd_wake_task, bpf_wakeup_cb, bd);
	timeout_set(&bd->bd_wait_tmo, bpf_wait_cb, bd);
	smr_init(&bd->bd_smr);
	sigio_init(&bd->bd_sigio);
	/* knotes on this descriptor are serialized by bd_mtx. */
	klist_init_mutex(&bd->bd_klist, &bd->bd_mtx);

	bd->bd_rtout = 0;	/* no timeout by default */
	bd->bd_wtout = INFSLP;	/* wait for the buffer to fill by default */

	/* The bpf_d_list entry holds the initial reference. */
	refcnt_init(&bd->bd_refcnt);
	LIST_INSERT_HEAD(&bpf_d_list, bd, bd_list);

	return (0);
}
404
/*
 * Close the descriptor by detaching it from its interface,
 * deallocating its buffers, and marking it free.
 * Sleeping readers are woken first; the final bpf_put() drops the
 * bpf_d_list reference (presumably freeing via bpf_d_smr() once the
 * refcount hits zero -- see bpf_put()).
 */
int
bpfclose(dev_t dev, int flag, int mode, struct proc *p)
{
	struct bpf_d *d;

	d = bpfilter_lookup(minor(dev));
	mtx_enter(&d->bd_mtx);
	bpf_detachd(d);
	bpf_wakeup(d);
	LIST_REMOVE(d, bd_list);
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (0);
}
424
/*
 * Rotate the packet buffers in descriptor d.  Move the store buffer
 * into the hold slot, and the free buffer into the store slot.
 * Zero the length of the new store buffer.
 *
 * Wrapped in do { } while (0) so the multi-statement body behaves as
 * a single statement at every use site, and the ``d'' argument is
 * consistently parenthesized (it was bare in the assertions before).
 * Must be called with bd_mtx held and no reader inside uiomove.
 */
#define ROTATE_BUFFERS(d)						\
do {									\
	KASSERT((d)->bd_in_uiomove == 0);				\
	MUTEX_ASSERT_LOCKED(&(d)->bd_mtx);				\
	(d)->bd_hbuf = (d)->bd_sbuf;					\
	(d)->bd_hlen = (d)->bd_slen;					\
	(d)->bd_sbuf = (d)->bd_fbuf;					\
	(d)->bd_state = BPF_S_IDLE;					\
	(d)->bd_slen = 0;						\
	(d)->bd_fbuf = NULL;						\
} while (0)
439
/*
 * bpfread - read next chunk of packets from buffers
 *
 * Blocks until the hold buffer has data, the read timeout (bd_rtout)
 * expires, or the attached interface disappears.  The caller's buffer
 * must be exactly bd_bufsize bytes.
 */
int
bpfread(dev_t dev, struct uio *uio, int ioflag)
{
	uint64_t end, now;
	struct bpf_d *d;
	caddr_t hbuf;
	int error, hlen;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	mtx_enter(&d->bd_mtx);

	/*
	 * Restrict application to use a buffer the same size as
	 * the kernel buffers.
	 */
	if (uio->uio_resid != d->bd_bufsize) {
		error = EINVAL;
		goto out;
	}

	/*
	 * If there's a timeout, mark when the read should end.
	 */
	if (d->bd_rtout != 0) {
		now = nsecuptime();
		end = now + d->bd_rtout;
		if (end < now)	/* clamp on uint64 overflow */
			end = UINT64_MAX;
	}

	/*
	 * If the hold buffer is empty, then do a timed sleep, which
	 * ends when the timeout expires or when enough packets
	 * have arrived to fill the store buffer.
	 */
	while (d->bd_hbuf == NULL) {
		if (d->bd_bif == NULL) {
			/* interface is gone; drain what is left */
			if (d->bd_slen == 0) {
				error = EIO;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
		if (d->bd_state == BPF_S_DONE) {
			/*
			 * A packet(s) either arrived since the previous
			 * read or arrived while we were asleep.
			 * Rotate the buffers and return what's here.
			 */
			ROTATE_BUFFERS(d);
			break;
		}
		if (ISSET(ioflag, IO_NDELAY)) {
			/* User requested non-blocking I/O */
			error = EWOULDBLOCK;
		} else if (d->bd_rtout == 0) {
			/* No read timeout set. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", INFSLP);
			d->bd_nreaders--;
		} else if ((now = nsecuptime()) < end) {
			/* Read timeout has not expired yet. */
			d->bd_nreaders++;
			error = msleep_nsec(d, &d->bd_mtx, PRINET|PCATCH,
			    "bpf", end - now);
			d->bd_nreaders--;
		} else {
			/* Read timeout has expired. */
			error = EWOULDBLOCK;
		}
		if (error == EINTR || error == ERESTART)
			goto out;
		if (error == EWOULDBLOCK) {
			/*
			 * On a timeout, return what's in the buffer,
			 * which may be nothing. If there is something
			 * in the store buffer, we can rotate the buffers.
			 */
			if (d->bd_hbuf != NULL)
				/*
				 * We filled up the buffer in between
				 * getting the timeout and arriving
				 * here, so we don't need to rotate.
				 */
				break;

			if (d->bd_slen == 0) {
				/* Nothing captured: a zero-length read. */
				error = 0;
				goto out;
			}
			ROTATE_BUFFERS(d);
			break;
		}
	}
	/*
	 * At this point, we know we have something in the hold slot.
	 */
	hbuf = d->bd_hbuf;
	hlen = d->bd_hlen;
	d->bd_hbuf = NULL;
	d->bd_hlen = 0;
	d->bd_fbuf = NULL;
	d->bd_in_uiomove = 1;

	/*
	 * Move data from hold buffer into user space.
	 * We know the entire buffer is transferred since
	 * we checked above that the read buffer is bpf_bufsize bytes.
	 */
	mtx_leave(&d->bd_mtx);
	error = uiomove(hbuf, hlen, uio);
	mtx_enter(&d->bd_mtx);

	/* Ensure that bpf_resetd() or ROTATE_BUFFERS() haven't been called. */
	KASSERT(d->bd_fbuf == NULL);
	KASSERT(d->bd_hbuf == NULL);
	d->bd_fbuf = hbuf;
	d->bd_in_uiomove = 0;
out:
	mtx_leave(&d->bd_mtx);
	bpf_put(d);

	return (error);
}
576
/*
 * If there are processes sleeping on this descriptor, wake them up.
 * Called holding ``bd_mtx''.  Also activates kqueue notes and, for
 * async descriptors, schedules SIGIO delivery on the system taskq.
 */
void
bpf_wakeup(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);

	if (d->bd_nreaders)
		wakeup(d);

	knote_locked(&d->bd_klist, 0);

	/*
	 * As long as pgsigio() needs to be protected
	 * by the KERNEL_LOCK() we have to delay the wakeup to
	 * another context to keep the hot path KERNEL_LOCK()-free.
	 */
	if (d->bd_async && d->bd_sig) {
		/* Reference for the task; released in bpf_wakeup_cb(). */
		bpf_get(d);
		if (!task_add(systq, &d->bd_wake_task))
			bpf_put(d);	/* task already queued */
	}
}
601
602void
603bpf_wakeup_cb(void *xd)
604{
605 struct bpf_d *d = xd;
606
607 if (d->bd_async && d->bd_sig)
608 pgsigio(&d->bd_sigio, d->bd_sig, 0);
609
610 bpf_put(d);
611}
612
613void
614bpf_wait_cb(void *xd)
615{
616 struct bpf_d *d = xd;
617
618 mtx_enter(&d->bd_mtx);
619 if (d->bd_state == BPF_S_WAIT) {
620 d->bd_state = BPF_S_DONE;
621 bpf_wakeup(d);
622 }
623 mtx_leave(&d->bd_mtx);
624
625 bpf_put(d);
626}
627
/*
 * Inject a packet from userland directly out of the attached interface.
 * The payload is validated and filtered in bpf_movein() and must fit
 * within the interface MTU.  Returns 0 or an errno.
 */
int
bpfwrite(dev_t dev, struct uio *uio, int ioflag)
{
	struct bpf_d *d;
	struct ifnet *ifp;
	struct mbuf *m;
	int error;
	struct sockaddr_storage dst;

	KERNEL_ASSERT_LOCKED();

	d = bpfilter_lookup(minor(dev));
	if (d->bd_bif == NULL)
		return (ENXIO);

	bpf_get(d);
	ifp = d->bd_bif->bif_ifp;

	if (ifp == NULL || (ifp->if_flags & IFF_UP) == 0) {
		error = ENETDOWN;
		goto out;
	}

	if (uio->uio_resid == 0) {
		/* Nothing to send. */
		error = 0;
		goto out;
	}

	error = bpf_movein(uio, d, &m, sstosa(&dst));
	if (error)
		goto out;

	if (m->m_pkthdr.len > ifp->if_mtu) {
		m_freem(m);
		error = EMSGSIZE;
		goto out;
	}

	m->m_pkthdr.ph_rtableid = ifp->if_rdomain;
	m->m_pkthdr.pf.prio = ifp->if_llprio;

	/* Tell the link layer the caller supplied a complete header. */
	if (d->bd_hdrcmplt && dst.ss_family == AF_UNSPEC)
		dst.ss_family = pseudo_AF_HDRCMPLT;

	NET_LOCK();
	error = ifp->if_output(ifp, m, sstosa(&dst), NULL);
	NET_UNLOCK();

out:
	bpf_put(d);
	return (error);
}
680
/*
 * Reset a descriptor by flushing its packet buffer and clearing the
 * receive and drop counts.  Called holding ``bd_mtx''; must not run
 * while a reader is inside uiomove (bd_in_uiomove).
 */
void
bpf_resetd(struct bpf_d *d)
{
	MUTEX_ASSERT_LOCKED(&d->bd_mtx);
	KASSERT(d->bd_in_uiomove == 0);

	/*
	 * Cancel a pending wait timeout; if it was armed, drop the
	 * reference presumably taken when it was scheduled.
	 */
	if (timeout_del(&d->bd_wait_tmo))
		bpf_put(d);

	if (d->bd_hbuf != NULL) {
		/* Free the hold buffer. */
		d->bd_fbuf = d->bd_hbuf;
		d->bd_hbuf = NULL;
	}
	d->bd_state = BPF_S_IDLE;
	d->bd_slen = 0;
	d->bd_hlen = 0;
	d->bd_rcount = 0;
	d->bd_dcount = 0;
}
705
706static int
707bpf_set_wtout(struct bpf_d *d, uint64_t wtout)
708{
709 mtx_enter(&d->bd_mtx);
710 d->bd_wtout = wtout;
711 mtx_leave(&d->bd_mtx);
712
713 return (0);
714}
715
716static int
717bpf_set_wtimeout(struct bpf_d *d, const struct timeval *tv)
718{
719 uint64_t nsec;
720
721 if (tv->tv_sec < 0 || !timerisvalid(tv))
722 return (EINVAL);
723
724 nsec = TIMEVAL_TO_NSEC(tv);
725 if (nsec > SEC_TO_NSEC(300))
726 return (EINVAL);
727 if (nsec > MAXTSLP)
728 return (EOVERFLOW);
729
730 return (bpf_set_wtout(d, nsec));
731}
732
733static int
734bpf_get_wtimeout(struct bpf_d *d, struct timeval *tv)
735{
736 uint64_t nsec;
737
738 mtx_enter(&d->bd_mtx);
739 nsec = d->bd_wtout;
740 mtx_leave(&d->bd_mtx);
741
742 if (nsec == INFSLP)
743 return (ENXIO);
744
745 memset(tv, 0, sizeof(*tv));
746 NSEC_TO_TIMEVAL(nsec, tv);
747
748 return (0);
749}
750
/*
 * FIONREAD		Check for read packet available.
 * BIOCGBLEN		Get buffer len [for read()].
 * BIOCSETF		Set read filter.
 * BIOCSETFNR		Set read filter without resetting descriptor.
 * BIOCFLUSH		Flush read packet buffer.
 * BIOCPROMISC		Put interface into promiscuous mode.
 * BIOCGDLTLIST		Get supported link layer types.
 * BIOCGDLT		Get link layer type.
 * BIOCSDLT		Set link layer type.
 * BIOCGETIF		Get interface name.
 * BIOCSETIF		Set interface.
 * BIOCSRTIMEOUT	Set read timeout.
 * BIOCGRTIMEOUT	Get read timeout.
 * BIOCSWTIMEOUT	Set wait timeout.
 * BIOCGWTIMEOUT	Get wait timeout.
 * BIOCDWTIMEOUT	Del wait timeout.
 * BIOCGSTATS		Get packet stats.
 * BIOCIMMEDIATE	Set immediate mode.
 * BIOCVERSION		Get filter language version.
 * BIOCGHDRCMPLT	Get "header already complete" flag
 * BIOCSHDRCMPLT	Set "header already complete" flag
 */
int
bpfioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p)
{
	struct bpf_d *d;
	int error = 0;

	d = bpfilter_lookup(minor(dev));
	if (d->bd_locked) {
		/* list of allowed ioctls when locked */
		switch (cmd) {
		case BIOCGBLEN:
		case BIOCFLUSH:
		case BIOCGDLT:
		case BIOCGDLTLIST:
		case BIOCGETIF:
		case BIOCGRTIMEOUT:
		case BIOCGWTIMEOUT:
		case BIOCGSTATS:
		case BIOCVERSION:
		case BIOCGRSIG:
		case BIOCGHDRCMPLT:
		case FIONREAD:
		case BIOCLOCK:
		case BIOCSRTIMEOUT:
		case BIOCSWTIMEOUT:
		case BIOCDWTIMEOUT:
		case BIOCIMMEDIATE:
		case TIOCGPGRP:
		case BIOCGDIRFILT:
			break;
		default:
			return (EPERM);
		}
	}

	bpf_get(d);

	switch (cmd) {
	default:
		error = EINVAL;
		break;

	/*
	 * Check for read packet available.
	 */
	case FIONREAD:
		{
			int n;

			/* Bytes buffered = store buffer + held hold buffer. */
			mtx_enter(&d->bd_mtx);
			n = d->bd_slen;
			if (d->bd_hbuf != NULL)
				n += d->bd_hlen;
			mtx_leave(&d->bd_mtx);

			*(int *)addr = n;
			break;
		}

	/*
	 * Get buffer len [for read()].
	 */
	case BIOCGBLEN:
		*(u_int *)addr = d->bd_bufsize;
		break;

	/*
	 * Set buffer length.  Only allowed before an interface is
	 * attached (the buffers are sized at attach time); the value
	 * is clamped to [BPF_MINBUFSIZE, bpf_maxbufsize] and the
	 * clamped result is written back to userland.
	 */
	case BIOCSBLEN:
		if (d->bd_bif != NULL)
			error = EINVAL;
		else {
			u_int size = *(u_int *)addr;
			int bpf_maxbufsize_local =
			    atomic_load_int(&bpf_maxbufsize);

			if (size > bpf_maxbufsize_local)
				*(u_int *)addr = size = bpf_maxbufsize_local;
			else if (size < BPF_MINBUFSIZE)
				*(u_int *)addr = size = BPF_MINBUFSIZE;
			mtx_enter(&d->bd_mtx);
			d->bd_bufsize = size;
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Set link layer read/write filter.
	 */
	case BIOCSETF:
	case BIOCSETFNR:
	case BIOCSETWF:
		error = bpf_setf(d, (struct bpf_program *)addr, cmd);
		break;

	/*
	 * Flush read packet buffer.
	 */
	case BIOCFLUSH:
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
		break;

	/*
	 * Put interface into promiscuous mode.
	 */
	case BIOCPROMISC:
		if (d->bd_bif == NULL) {
			/*
			 * No interface attached yet.
			 */
			error = EINVAL;
		} else if (d->bd_bif->bif_ifp != NULL) {
			/* Idempotent: only the first request calls down. */
			if (d->bd_promisc == 0) {
				MUTEX_ASSERT_UNLOCKED(&d->bd_mtx);
				NET_LOCK();
				error = ifpromisc(d->bd_bif->bif_ifp, 1);
				NET_UNLOCK();
				if (error == 0)
					d->bd_promisc = 1;
			}
		}
		break;

	/*
	 * Get a list of supported device parameters.
	 */
	case BIOCGDLTLIST:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
		break;

	/*
	 * Get device parameters.
	 */
	case BIOCGDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			*(u_int *)addr = d->bd_bif->bif_dlt;
		break;

	/*
	 * Set device parameters.
	 */
	case BIOCSDLT:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else {
			mtx_enter(&d->bd_mtx);
			error = bpf_setdlt(d, *(u_int *)addr);
			mtx_leave(&d->bd_mtx);
		}
		break;

	/*
	 * Get interface name.
	 */
	case BIOCGETIF:
		if (d->bd_bif == NULL)
			error = EINVAL;
		else
			bpf_ifname(d->bd_bif, (struct ifreq *)addr);
		break;

	/*
	 * Set interface.
	 */
	case BIOCSETIF:
		error = bpf_setif(d, (struct ifreq *)addr);
		break;

	/*
	 * Set read timeout.
	 */
	case BIOCSRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;
			uint64_t rtout;

			if (tv->tv_sec < 0 || !timerisvalid(tv)) {
				error = EINVAL;
				break;
			}
			rtout = TIMEVAL_TO_NSEC(tv);
			if (rtout > MAXTSLP) {
				error = EOVERFLOW;
				break;
			}
			mtx_enter(&d->bd_mtx);
			d->bd_rtout = rtout;
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get read timeout.
	 */
	case BIOCGRTIMEOUT:
		{
			struct timeval *tv = (struct timeval *)addr;

			memset(tv, 0, sizeof(*tv));
			mtx_enter(&d->bd_mtx);
			NSEC_TO_TIMEVAL(d->bd_rtout, tv);
			mtx_leave(&d->bd_mtx);
			break;
		}

	/*
	 * Get packet stats.
	 */
	case BIOCGSTATS:
		{
			struct bpf_stat *bs = (struct bpf_stat *)addr;

			bs->bs_recv = d->bd_rcount;
			bs->bs_drop = d->bd_dcount;
			break;
		}

	/*
	 * Set immediate mode: a zero wait timeout delivers packets as
	 * they arrive; clearing it restores wait-until-full (INFSLP).
	 */
	case BIOCIMMEDIATE:
		error = bpf_set_wtout(d, *(int *)addr ? 0 : INFSLP);
		break;

	/*
	 * Wait timeout.
	 */
	case BIOCSWTIMEOUT:
		error = bpf_set_wtimeout(d, (const struct timeval *)addr);
		break;
	case BIOCGWTIMEOUT:
		error = bpf_get_wtimeout(d, (struct timeval *)addr);
		break;
	case BIOCDWTIMEOUT:
		error = bpf_set_wtout(d, INFSLP);
		break;

	case BIOCVERSION:
		{
			struct bpf_version *bv = (struct bpf_version *)addr;

			bv->bv_major = BPF_MAJOR_VERSION;
			bv->bv_minor = BPF_MINOR_VERSION;
			break;
		}

	case BIOCGHDRCMPLT:	/* get "header already complete" flag */
		*(u_int *)addr = d->bd_hdrcmplt;
		break;

	case BIOCSHDRCMPLT:	/* set "header already complete" flag */
		d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
		break;

	case BIOCLOCK:		/* set "locked" flag (no reset) */
		d->bd_locked = 1;
		break;

	case BIOCGFILDROP:	/* get "filter-drop" flag */
		*(u_int *)addr = d->bd_fildrop;
		break;

	case BIOCSFILDROP: {	/* set "filter-drop" flag */
		unsigned int fildrop = *(u_int *)addr;
		switch (fildrop) {
		case BPF_FILDROP_PASS:
		case BPF_FILDROP_CAPTURE:
		case BPF_FILDROP_DROP:
			d->bd_fildrop = fildrop;
			break;
		default:
			error = EINVAL;
			break;
		}
		break;
	}

	case BIOCGDIRFILT:	/* get direction filter */
		*(u_int *)addr = d->bd_dirfilt;
		break;

	case BIOCSDIRFILT:	/* set direction filter */
		d->bd_dirfilt = (*(u_int *)addr) &
		    (BPF_DIRECTION_IN|BPF_DIRECTION_OUT);
		break;

	case FIOASYNC:		/* Send signal on receive packets */
		d->bd_async = *(int *)addr;
		break;

	case FIOSETOWN:		/* Process or group to send signals to */
	case TIOCSPGRP:
		error = sigio_setown(&d->bd_sigio, cmd, addr);
		break;

	case FIOGETOWN:
	case TIOCGPGRP:
		sigio_getown(&d->bd_sigio, cmd, addr);
		break;

	case BIOCSRSIG:		/* Set receive signal */
		{
			u_int sig;

			sig = *(u_int *)addr;

			if (sig >= NSIG)
				error = EINVAL;
			else
				d->bd_sig = sig;
			break;
		}
	case BIOCGRSIG:
		*(u_int *)addr = d->bd_sig;
		break;
	}

	bpf_put(d);
	return (error);
}
1102
/*
 * Set d's packet filter program to fp.  If this file already has a filter,
 * free it and replace it.  Returns EINVAL for bogus requests.
 * The replaced program is released via smr_call() once concurrent
 * SMR readers (_bpf_mtap(), bpf_movein()) are guaranteed to be done.
 */
int
bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
{
	struct bpf_program_smr *bps, *old_bps;
	struct bpf_insn *fcode;
	u_int flen, size;

	KERNEL_ASSERT_LOCKED();

	if (fp->bf_insns == 0) {
		/* No instructions: remove the current filter, if any. */
		if (fp->bf_len != 0)
			return (EINVAL);
		bps = NULL;
	} else {
		flen = fp->bf_len;
		if (flen > BPF_MAXINSNS)
			return (EINVAL);

		fcode = mallocarray(flen, sizeof(*fp->bf_insns), M_DEVBUF,
		    M_WAITOK | M_CANFAIL);
		if (fcode == NULL)
			return (ENOMEM);

		size = flen * sizeof(*fp->bf_insns);
		/* Copy the program in and make sure it is safe to run. */
		if (copyin(fp->bf_insns, fcode, size) != 0 ||
		    bpf_validate(fcode, (int)flen) == 0) {
			free(fcode, M_DEVBUF, size);
			return (EINVAL);
		}

		bps = malloc(sizeof(*bps), M_DEVBUF, M_WAITOK);
		smr_init(&bps->bps_smr);
		bps->bps_bf.bf_len = flen;
		bps->bps_bf.bf_insns = fcode;
	}

	if (cmd != BIOCSETWF) {
		/* BIOCSETF and BIOCSETFNR install the read filter. */
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_rfilter);
		SMR_PTR_SET_LOCKED(&d->bd_rfilter, bps);
	} else {
		old_bps = SMR_PTR_GET_LOCKED(&d->bd_wfilter);
		SMR_PTR_SET_LOCKED(&d->bd_wfilter, bps);
	}

	if (cmd == BIOCSETF) {
		/* Only BIOCSETF also flushes buffered packets. */
		mtx_enter(&d->bd_mtx);
		bpf_resetd(d);
		mtx_leave(&d->bd_mtx);
	}

	if (old_bps != NULL)
		smr_call(&old_bps->bps_smr, bpf_prog_smr, old_bps);

	return (0);
}
1162
/*
 * Detach a file from its current interface (if attached at all) and attach
 * to the interface indicated by the name stored in ifr.
 * Return an errno or 0.
 */
int
bpf_setif(struct bpf_d *d, struct ifreq *ifr)
{
	struct bpf_if *bp;
	int error = 0;

	/*
	 * Look through attached interfaces for the named one.
	 */
	TAILQ_FOREACH(bp, &bpf_iflist, bif_next) {
		if (strcmp(bp->bif_name, ifr->ifr_name) == 0)
			break;
	}

	/* Not found. */
	if (bp == NULL)
		return (ENXIO);

	/*
	 * Allocate the packet buffers if we need to.
	 * If we're already attached to requested interface,
	 * just flush the buffer.
	 */
	mtx_enter(&d->bd_mtx);
	if (d->bd_sbuf == NULL) {
		if ((error = bpf_allocbufs(d)))
			goto out;
	}
	if (bp != d->bd_bif) {
		/*
		 * Detach if attached to something else.
		 */
		bpf_detachd(d);
		bpf_attachd(d, bp);
	}
	/* Discard any capture state from the previous attachment. */
	bpf_resetd(d);
out:
	mtx_leave(&d->bd_mtx);
	return (error);
}
1208
1209/*
1210 * Copy the interface name to the ifreq.
1211 */
1212void
1213bpf_ifname(struct bpf_if *bif, struct ifreq *ifr)
1214{
1215 bcopy(bif->bif_name, ifr->ifr_name, sizeof(ifr->ifr_name));
1216}
1217
/* kqueue(2) EVFILT_READ hooks; knotes are serialized by bd_mtx. */
const struct filterops bpfread_filtops = {
	.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
	.f_attach = NULL,	/* attachment is done in bpfkqfilter() */
	.f_detach = filt_bpfrdetach,
	.f_event = filt_bpfread,
	.f_modify = filt_bpfreadmodify,
	.f_process = filt_bpfreadprocess,
};
1226
1227int
1228bpfkqfilter(dev_t dev, struct knote *kn)
1229{
1230 struct bpf_d *d;
1231 struct klist *klist;
1232
1233 KERNEL_ASSERT_LOCKED();
1234
1235 d = bpfilter_lookup(minor(dev));
1236 if (d == NULL)
1237 return (ENXIO);
1238
1239 switch (kn->kn_filter) {
1240 case EVFILT_READ:
1241 klist = &d->bd_klist;
1242 kn->kn_fop = &bpfread_filtops;
1243 break;
1244 default:
1245 return (EINVAL);
1246 }
1247
1248 bpf_get(d);
1249 kn->kn_hook = d;
1250 klist_insert(klist, kn);
1251
1252 return (0);
1253}
1254
1255void
1256filt_bpfrdetach(struct knote *kn)
1257{
1258 struct bpf_d *d = kn->kn_hook;
1259
1260 klist_remove(&d->bd_klist, kn);
1261 bpf_put(d);
1262}
1263
1264int
1265filt_bpfread(struct knote *kn, long hint)
1266{
1267 struct bpf_d *d = kn->kn_hook;
1268
1269 MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1270
1271 kn->kn_data = d->bd_hlen;
1272 if (d->bd_state == BPF_S_DONE)
1273 kn->kn_data += d->bd_slen;
1274
1275 return (kn->kn_data > 0);
1276}
1277
1278int
1279filt_bpfreadmodify(struct kevent *kev, struct knote *kn)
1280{
1281 struct bpf_d *d = kn->kn_hook;
1282 int active;
1283
1284 mtx_enter(&d->bd_mtx);
1285 active = knote_modify_fn(kev, kn, filt_bpfread);
1286 mtx_leave(&d->bd_mtx);
1287
1288 return (active);
1289}
1290
1291int
1292filt_bpfreadprocess(struct knote *kn, struct kevent *kev)
1293{
1294 struct bpf_d *d = kn->kn_hook;
1295 int active;
1296
1297 mtx_enter(&d->bd_mtx);
1298 active = knote_process_fn(kn, kev, filt_bpfread);
1299 mtx_leave(&d->bd_mtx);
1300
1301 return (active);
1302}
1303
1304/*
1305 * Copy data from an mbuf chain into a buffer. This code is derived
1306 * from m_copydata in sys/uipc_mbuf.c.
1307 */
1308void
1309bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
1310{
1311 const struct mbuf *m;
1312 u_int count;
1313 u_char *dst;
1314
1315 m = src_arg;
1316 dst = dst_arg;
1317 while (len > 0) {
1318 if (m == NULL)
1319 panic("bpf_mcopy");
1320 count = min(m->m_len, len);
1321 bcopy(mtod(m, caddr_t), (caddr_t)dst, count);
1322 m = m->m_next;
1323 dst += count;
1324 len -= count;
1325 }
1326}
1327
1328int
1329bpf_mtap(caddr_t arg, const struct mbuf *m, u_int direction)
1330{
1331 return _bpf_mtap(arg, m, m, direction);
1332}
1333
/*
 * Run a packet past every listener on this attachment point and capture
 * it where the read filter matches.  ``mp'' supplies the packet-header
 * metadata (timestamp, flowid, priority); ``m'' is the chain the filter
 * and capture run on.  Executes inside an SMR read section, so it never
 * sleeps.  Returns nonzero if any matching listener asked for the
 * packet to be dropped (bd_fildrop != BPF_FILDROP_PASS).
 */
int
_bpf_mtap(caddr_t arg, const struct mbuf *mp, const struct mbuf *m,
    u_int direction)
{
	struct bpf_if *bp = (struct bpf_if *)arg;
	struct bpf_d *d;
	size_t pktlen, slen;
	const struct mbuf *m0;
	struct bpf_hdr tbh;
	int gothdr = 0;
	int drop = 0;

	if (m == NULL)
		return (0);

	if (bp == NULL)
		return (0);

	/* Total packet length across the whole chain. */
	pktlen = 0;
	for (m0 = m; m0 != NULL; m0 = m0->m_next)
		pktlen += m0->m_len;

	smr_read_enter();
	SMR_SLIST_FOREACH(d, &bp->bif_dlist, bd_next) {
		struct bpf_program_smr *bps;
		struct bpf_insn *fcode = NULL;

		atomic_inc_long(&d->bd_rcount);

		/* Skip listeners filtering out this direction. */
		if (ISSET(d->bd_dirfilt, direction))
			continue;

		/* No filter program means accept the whole packet. */
		bps = SMR_PTR_GET(&d->bd_rfilter);
		if (bps != NULL)
			fcode = bps->bps_bf.bf_insns;
		slen = bpf_mfilter(fcode, m, pktlen);

		if (slen == 0)
			continue;
		if (d->bd_fildrop != BPF_FILDROP_PASS)
			drop = 1;
		if (d->bd_fildrop != BPF_FILDROP_DROP) {
			if (!gothdr) {
				/*
				 * Build the capture header lazily, and
				 * only once for all matching listeners.
				 */
				struct timeval tv;
				memset(&tbh, 0, sizeof(tbh));

				if (ISSET(mp->m_flags, M_PKTHDR)) {
					tbh.bh_ifidx = mp->m_pkthdr.ph_ifidx;
					tbh.bh_flowid = mp->m_pkthdr.ph_flowid;
					tbh.bh_flags = mp->m_pkthdr.pf.prio;
					if (ISSET(mp->m_pkthdr.csum_flags,
					    M_FLOWID))
						SET(tbh.bh_flags, BPF_F_FLOWID);
					tbh.bh_csumflags =
					    mp->m_pkthdr.csum_flags;

					m_microtime(mp, &tv);
				} else
					microtime(&tv);

				tbh.bh_tstamp.tv_sec = tv.tv_sec;
				tbh.bh_tstamp.tv_usec = tv.tv_usec;
				SET(tbh.bh_flags, direction << BPF_F_DIR_SHIFT);

				gothdr = 1;
			}

			mtx_enter(&d->bd_mtx);
			bpf_catchpacket(d, (u_char *)m, pktlen, slen, &tbh);
			mtx_leave(&d->bd_mtx);
		}
	}
	smr_read_leave();

	return (drop);
}
1410
1411/*
1412 * Incoming linkage from device drivers, where a data buffer should be
1413 * prepended by an arbitrary header. In this situation we already have a
1414 * way of representing a chain of memory buffers, ie, mbufs, so reuse
1415 * the existing functionality by attaching the buffers to mbufs.
1416 *
1417 * Con up a minimal mbuf chain to pacify bpf by allocating (only) a
1418 * struct m_hdr each for the header and data on the stack.
1419 */
1420int
1421bpf_tap_hdr(caddr_t arg, const void *hdr, unsigned int hdrlen,
1422 const void *buf, unsigned int buflen, u_int direction)
1423{
1424 struct m_hdr mh, md;
1425 struct mbuf *m0 = NULL;
1426 struct mbuf **mp = &m0;
1427
1428 if (hdr != NULL) {
1429 mh.mh_flags = 0;
1430 mh.mh_next = NULL;
1431 mh.mh_len = hdrlen;
1432 mh.mh_data = (void *)hdr;
1433
1434 *mp = (struct mbuf *)&mh;
1435 mp = &mh.mh_next;
1436 }
1437
1438 if (buf != NULL) {
1439 md.mh_flags = 0;
1440 md.mh_next = NULL;
1441 md.mh_len = buflen;
1442 md.mh_data = (void *)buf;
1443
1444 *mp = (struct mbuf *)&md;
1445 }
1446
1447 return bpf_mtap(arg, m0, direction);
1448}
1449
1450/*
1451 * Incoming linkage from device drivers, where we have a mbuf chain
1452 * but need to prepend some arbitrary header from a linear buffer.
1453 *
1454 * Con up a minimal dummy header to pacify bpf. Allocate (only) a
1455 * struct m_hdr on the stack. This is safe as bpf only reads from the
1456 * fields in this header that we initialize, and will not try to free
1457 * it or keep a pointer to it.
1458 */
1459int
1460bpf_mtap_hdr(caddr_t arg, const void *data, u_int dlen, const struct mbuf *m,
1461 u_int direction)
1462{
1463 struct m_hdr mh;
1464 const struct mbuf *m0;
1465
1466 if (dlen > 0) {
1467 mh.mh_flags = 0;
1468 mh.mh_next = (struct mbuf *)m;
1469 mh.mh_len = dlen;
1470 mh.mh_data = (void *)data;
1471 m0 = (struct mbuf *)&mh;
1472 } else
1473 m0 = m;
1474
1475 return _bpf_mtap(arg, m, m0, direction);
1476}
1477
1478/*
1479 * Incoming linkage from device drivers, where we have a mbuf chain
1480 * but need to prepend the address family.
1481 *
1482 * Con up a minimal dummy header to pacify bpf. We allocate (only) a
1483 * struct m_hdr on the stack. This is safe as bpf only reads from the
1484 * fields in this header that we initialize, and will not try to free
1485 * it or keep a pointer to it.
1486 */
1487int
1488bpf_mtap_af(caddr_t arg, u_int32_t af, const struct mbuf *m, u_int direction)
1489{
1490 u_int32_t afh;
1491
1492 afh = htonl(af);
1493
1494 return bpf_mtap_hdr(arg, &afh, sizeof(afh), m, direction);
1495}
1496
1497/*
1498 * Incoming linkage from device drivers, where we have a mbuf chain
1499 * but need to prepend a VLAN encapsulation header.
1500 *
1501 * Con up a minimal dummy header to pacify bpf. Allocate (only) a
1502 * struct m_hdr on the stack. This is safe as bpf only reads from the
1503 * fields in this header that we initialize, and will not try to free
1504 * it or keep a pointer to it.
1505 */
1506int
1507bpf_mtap_ether(caddr_t arg, const struct mbuf *m, u_int direction)
1508{
1509#if NVLAN > 0
1510 struct ether_vlan_header evh;
1511 struct m_hdr mh, md;
1512
1513 if ((m->m_flags & M_VLANTAG) == 0)
1514#endif
1515 {
1516 return _bpf_mtap(arg, m, m, direction);
1517 }
1518
1519#if NVLAN > 0
1520 KASSERT(m->m_len >= ETHER_HDR_LEN);
1521
1522 memcpy(&evh, mtod(m, char *), ETHER_HDR_LEN);
1523 evh.evl_proto = evh.evl_encap_proto;
1524 evh.evl_encap_proto = htons(ETHERTYPE_VLAN);
1525 evh.evl_tag = htons(m->m_pkthdr.ether_vtag);
1526
1527 mh.mh_flags = 0;
1528 mh.mh_data = (caddr_t)&evh;
1529 mh.mh_len = sizeof(evh);
1530 mh.mh_next = (struct mbuf *)&md;
1531
1532 md.mh_flags = 0;
1533 md.mh_data = m->m_data + ETHER_HDR_LEN;
1534 md.mh_len = m->m_len - ETHER_HDR_LEN;
1535 md.mh_next = m->m_next;
1536
1537 return _bpf_mtap(arg, m, (struct mbuf *)&mh, direction);
1538#endif
1539}
1540
1541/*
1542 * Move the packet data from interface memory (pkt) into the
1543 * store buffer. Wake up listeners if needed.
1544 * "copy" is the routine called to do the actual data
1545 * transfer. bcopy is passed in to copy contiguous chunks, while
1546 * bpf_mcopy is passed in to copy mbuf chains. In the latter case,
1547 * pkt is really an mbuf.
1548 */
1549void
1550bpf_catchpacket(struct bpf_d *d, u_char *pkt, size_t pktlen, size_t snaplen,
1551 const struct bpf_hdr *tbh)
1552{
1553 struct bpf_hdr *bh;
1554 int totlen, curlen;
1555 int hdrlen, do_wakeup = 0;
1556
1557 MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1558 if (d->bd_bif == NULL)
1559 return;
1560
1561 hdrlen = d->bd_bif->bif_hdrlen;
1562
1563 /*
1564 * Figure out how many bytes to move. If the packet is
1565 * greater or equal to the snapshot length, transfer that
1566 * much. Otherwise, transfer the whole packet (unless
1567 * we hit the buffer size limit).
1568 */
1569 totlen = hdrlen + min(snaplen, pktlen);
1570 if (totlen > d->bd_bufsize)
1571 totlen = d->bd_bufsize;
1572
1573 /*
1574 * Round up the end of the previous packet to the next longword.
1575 */
1576 curlen = BPF_WORDALIGN(d->bd_slen);
1577 if (curlen + totlen > d->bd_bufsize) {
1578 /*
1579 * This packet will overflow the storage buffer.
1580 * Rotate the buffers if we can, then wakeup any
1581 * pending reads.
1582 */
1583 if (d->bd_fbuf == NULL) {
1584 /*
1585 * We haven't completed the previous read yet,
1586 * so drop the packet.
1587 */
1588 ++d->bd_dcount;
1589 return;
1590 }
1591
1592 /* cancel pending wtime */
1593 if (timeout_del(&d->bd_wait_tmo))
1594 bpf_put(d);
1595
1596 ROTATE_BUFFERS(d);
1597 do_wakeup = 1;
1598 curlen = 0;
1599 }
1600
1601 /*
1602 * Append the bpf header.
1603 */
1604 bh = (struct bpf_hdr *)(d->bd_sbuf + curlen);
1605 *bh = *tbh;
1606 bh->bh_datalen = pktlen;
1607 bh->bh_hdrlen = hdrlen;
1608 bh->bh_caplen = totlen - hdrlen;
1609
1610 /*
1611 * Copy the packet data into the store buffer and update its length.
1612 */
1613 bpf_mcopy(pkt, (u_char *)bh + hdrlen, bh->bh_caplen);
1614 d->bd_slen = curlen + totlen;
1615
1616 switch (d->bd_wtout) {
1617 case 0:
1618 /*
1619 * Immediate mode is set. A packet arrived so any
1620 * reads should be woken up.
1621 */
1622 if (d->bd_state == BPF_S_IDLE)
1623 d->bd_state = BPF_S_DONE;
1624 do_wakeup = 1;
1625 break;
1626 case INFSLP:
1627 break;
1628 default:
1629 if (d->bd_state == BPF_S_IDLE) {
1630 d->bd_state = BPF_S_WAIT;
1631
1632 bpf_get(d);
1633 if (!timeout_add_nsec(&d->bd_wait_tmo, d->bd_wtout))
1634 bpf_put(d);
1635 }
1636 break;
1637 }
1638
1639 if (do_wakeup)
1640 bpf_wakeup(d);
1641}
1642
1643/*
1644 * Initialize all nonzero fields of a descriptor.
1645 */
1646int
1647bpf_allocbufs(struct bpf_d *d)
1648{
1649 MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1650
1651 d->bd_fbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1652 if (d->bd_fbuf == NULL)
1653 return (ENOMEM);
1654
1655 d->bd_sbuf = malloc(d->bd_bufsize, M_DEVBUF, M_NOWAIT);
1656 if (d->bd_sbuf == NULL) {
1657 free(d->bd_fbuf, M_DEVBUF, d->bd_bufsize);
1658 d->bd_fbuf = NULL;
1659 return (ENOMEM);
1660 }
1661
1662 d->bd_slen = 0;
1663 d->bd_hlen = 0;
1664
1665 return (0);
1666}
1667
1668void
1669bpf_prog_smr(void *bps_arg)
1670{
1671 struct bpf_program_smr *bps = bps_arg;
1672
1673 free(bps->bps_bf.bf_insns, M_DEVBUF,
1674 bps->bps_bf.bf_len * sizeof(struct bpf_insn));
1675 free(bps, M_DEVBUF, sizeof(struct bpf_program_smr));
1676}
1677
/*
 * SMR callback: release everything attached to a descriptor once no
 * lockless reader can still reference it.
 */
void
bpf_d_smr(void *smr)
{
	struct bpf_d *bd = smr;

	sigio_free(&bd->bd_sigio);
	/* All three capture buffers share bd_bufsize. */
	free(bd->bd_sbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_hbuf, M_DEVBUF, bd->bd_bufsize);
	free(bd->bd_fbuf, M_DEVBUF, bd->bd_bufsize);

	if (bd->bd_rfilter != NULL)
		bpf_prog_smr(bd->bd_rfilter);
	if (bd->bd_wfilter != NULL)
		bpf_prog_smr(bd->bd_wfilter);

	klist_free(&bd->bd_klist);
	free(bd, M_DEVBUF, sizeof(*bd));
}
1696
/* Take a reference on a descriptor. */
void
bpf_get(struct bpf_d *bd)
{
	refcnt_take(&bd->bd_refcnt);
}
1702
1703/*
1704 * Free buffers currently in use by a descriptor
1705 * when the reference count drops to zero.
1706 */
1707void
1708bpf_put(struct bpf_d *bd)
1709{
1710 if (refcnt_rele(&bd->bd_refcnt) == 0)
1711 return;
1712
1713 smr_call(&bd->bd_smr, bpf_d_smr, bd);
1714}
1715
1716void *
1717bpfsattach(caddr_t *bpfp, const char *name, u_int dlt, u_int hdrlen)
1718{
1719 struct bpf_if *bp;
1720
1721 if ((bp = malloc(sizeof(*bp), M_DEVBUF, M_NOWAIT)) == NULL)
1722 panic("bpfattach");
1723 SMR_SLIST_INIT(&bp->bif_dlist);
1724 bp->bif_driverp = (struct bpf_if **)bpfp;
1725 bp->bif_name = name;
1726 bp->bif_ifp = NULL;
1727 bp->bif_dlt = dlt;
1728
1729 TAILQ_INSERT_TAIL(&bpf_iflist, bp, bif_next);
1730
1731 *bp->bif_driverp = NULL;
1732
1733 /*
1734 * Compute the length of the bpf header. This is not necessarily
1735 * equal to SIZEOF_BPF_HDR because we want to insert spacing such
1736 * that the network layer header begins on a longword boundary (for
1737 * performance reasons and to alleviate alignment restrictions).
1738 */
1739 bp->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
1740
1741 return (bp);
1742}
1743
1744void *
1745bpfxattach(caddr_t *driverp, const char *name, struct ifnet *ifp,
1746 u_int dlt, u_int hdrlen)
1747{
1748 struct bpf_if *bp;
1749
1750 bp = bpfsattach(driverp, name, dlt, hdrlen);
1751 bp->bif_ifp = ifp;
1752
1753 return (bp);
1754}
1755
/* Attach an interface's bpf tap, named after the interface itself. */
void
bpfattach(caddr_t *driverp, struct ifnet *ifp, u_int dlt, u_int hdrlen)
{
	bpfxattach(driverp, ifp->if_xname, ifp, dlt, hdrlen);
}
1761
/* Detach an interface from its attached bpf device. */
void
bpfdetach(struct ifnet *ifp)
{
	struct bpf_if *bp, *nbp;

	KERNEL_ASSERT_LOCKED();

	/* Tear down every tap registered for this interface. */
	TAILQ_FOREACH_SAFE(bp, &bpf_iflist, bif_next, nbp) {
		if (bp->bif_ifp == ifp)
			bpfsdetach(bp);
	}
	ifp->if_bpf = NULL;
}
1776
/*
 * Tear down a single bpf tap: revoke every open descriptor attached
 * to it, then unlink and free the bpf_if itself.
 */
void
bpfsdetach(void *p)
{
	struct bpf_if *bp = p;
	struct bpf_d *bd;
	int maj;

	/* Locate the major number. */
	KERNEL_ASSERT_LOCKED();

	for (maj = 0; maj < nchrdev; maj++)
		if (cdevsw[maj].d_open == bpfopen)
			break;

	/* Revoke open vnodes and pending knotes for each descriptor. */
	while ((bd = SMR_SLIST_FIRST_LOCKED(&bp->bif_dlist))) {
		/* Hold a reference across vdevgone/klist_invalidate. */
		bpf_get(bd);
		vdevgone(maj, bd->bd_unit, bd->bd_unit, VCHR);
		klist_invalidate(&bd->bd_klist);
		bpf_put(bd);
	}

	TAILQ_REMOVE(&bpf_iflist, bp, bif_next);

	free(bp, M_DEVBUF, sizeof(*bp));
}
1802
1803#ifndef SMALL_KERNEL
/* Handle the net.bpf sysctl subtree (leaf nodes only). */
int
bpf_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
    size_t newlen)
{
	if (namelen != 1)
		return (ENOTDIR);

	switch (name[0]) {
	case NET_BPF_BUFSIZE:
		/* Default buffer size, bounded above by the current max. */
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_bufsize, BPF_MINBUFSIZE,
		    atomic_load_int(&bpf_maxbufsize));
	case NET_BPF_MAXBUFSIZE:
		return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
		    &bpf_maxbufsize, BPF_MINBUFSIZE, MALLOC_MAX);
	default:
		return (EOPNOTSUPP);
	}

	/* NOTREACHED */
}
1825#endif /* SMALL_KERNEL */
1826
1827struct bpf_d *
1828bpfilter_lookup(int unit)
1829{
1830 struct bpf_d *bd;
1831
1832 KERNEL_ASSERT_LOCKED();
1833
1834 LIST_FOREACH(bd, &bpf_d_list, bd_list)
1835 if (bd->bd_unit == unit)
1836 return (bd);
1837 return (NULL);
1838}
1839
1840/*
1841 * Get a list of available data link type of the interface.
1842 */
1843int
1844bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
1845{
1846 int n, error;
1847 struct bpf_if *bp;
1848 const char *name;
1849
1850 name = d->bd_bif->bif_name;
1851 n = 0;
1852 error = 0;
1853 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) {
1854 if (strcmp(name, bp->bif_name) != 0)
1855 continue;
1856 if (bfl->bfl_list != NULL) {
1857 if (n >= bfl->bfl_len)
1858 return (ENOMEM);
1859 error = copyout(&bp->bif_dlt,
1860 bfl->bfl_list + n, sizeof(u_int));
1861 if (error)
1862 break;
1863 }
1864 n++;
1865 }
1866
1867 bfl->bfl_len = n;
1868 return (error);
1869}
1870
1871/*
1872 * Set the data link type of a BPF instance.
1873 */
1874int
1875bpf_setdlt(struct bpf_d *d, u_int dlt)
1876{
1877 const char *name;
1878 struct bpf_if *bp;
1879
1880 MUTEX_ASSERT_LOCKED(&d->bd_mtx);
1881 if (d->bd_bif->bif_dlt == dlt)
1882 return (0);
1883 name = d->bd_bif->bif_name;
1884 TAILQ_FOREACH(bp, &bpf_iflist, bif_next) {
1885 if (strcmp(name, bp->bif_name) != 0)
1886 continue;
1887 if (bp->bif_dlt == dlt)
1888 break;
1889 }
1890 if (bp == NULL)
1891 return (EINVAL);
1892 bpf_detachd(d);
1893 bpf_attachd(d, bp);
1894 bpf_resetd(d);
1895 return (0);
1896}
1897
1898u_int32_t bpf_mbuf_ldw(const void *, u_int32_t, int *);
1899u_int32_t bpf_mbuf_ldh(const void *, u_int32_t, int *);
1900u_int32_t bpf_mbuf_ldb(const void *, u_int32_t, int *);
1901
1902int bpf_mbuf_copy(const struct mbuf *, u_int32_t,
1903 void *, u_int32_t);
1904
/* Load helpers used by _bpf_filter() to read packet bytes from mbufs. */
const struct bpf_ops bpf_mbuf_ops = {
	bpf_mbuf_ldw,
	bpf_mbuf_ldh,
	bpf_mbuf_ldb,
};
1910
/*
 * Copy "len" bytes starting at chain offset "off" out of mbuf chain
 * "m" into "buf".  Returns 0 on success or -1 if the chain is shorter
 * than off + len bytes.
 */
int
bpf_mbuf_copy(const struct mbuf *m, u_int32_t off, void *buf, u_int32_t len)
{
	u_int8_t *cp = buf;
	u_int32_t count;

	/* Skip whole mbufs until "off" lands inside one. */
	while (off >= m->m_len) {
		off -= m->m_len;

		m = m->m_next;
		if (m == NULL)
			return (-1);
	}

	for (;;) {
		count = min(m->m_len - off, len);

		memcpy(cp, m->m_data + off, count);
		len -= count;

		if (len == 0)
			return (0);

		m = m->m_next;
		if (m == NULL)
			break;

		cp += count;
		off = 0;
	}

	/* Ran off the end of the chain before copying "len" bytes. */
	return (-1);
}
1944
1945u_int32_t
1946bpf_mbuf_ldw(const void *m0, u_int32_t k, int *err)
1947{
1948 u_int32_t v;
1949
1950 if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1951 *err = 1;
1952 return (0);
1953 }
1954
1955 *err = 0;
1956 return ntohl(v);
1957}
1958
1959u_int32_t
1960bpf_mbuf_ldh(const void *m0, u_int32_t k, int *err)
1961{
1962 u_int16_t v;
1963
1964 if (bpf_mbuf_copy(m0, k, &v, sizeof(v)) != 0) {
1965 *err = 1;
1966 return (0);
1967 }
1968
1969 *err = 0;
1970 return ntohs(v);
1971}
1972
1973u_int32_t
1974bpf_mbuf_ldb(const void *m0, u_int32_t k, int *err)
1975{
1976 const struct mbuf *m = m0;
1977 u_int8_t v;
1978
1979 while (k >= m->m_len) {
1980 k -= m->m_len;
1981
1982 m = m->m_next;
1983 if (m == NULL) {
1984 *err = 1;
1985 return (0);
1986 }
1987 }
1988 v = m->m_data[k];
1989
1990 *err = 0;
1991 return v;
1992}
1993
/*
 * Run bpf program "pc" over mbuf chain "m"; "wirelen" is the original
 * packet length.  Returns the snapshot length accepted by the filter
 * (0 rejects the packet).
 */
u_int
bpf_mfilter(const struct bpf_insn *pc, const struct mbuf *m, u_int wirelen)
{
	return _bpf_filter(pc, &bpf_mbuf_ops, m, wirelen);
}