sys/net/bpf.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1990, 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
   7  *
   8  * This code is derived from the Stanford/CMU enet packet filter,
   9  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  10  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  11  * Berkeley Laboratory.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. Neither the name of the University nor the names of its contributors
  22  *    may be used to endorse or promote products derived from this software
  23  *    without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  *      @(#)bpf.c       8.4 (Berkeley) 1/9/95
  38  */
  39
  40 #include <sys/cdefs.h>
  41 __FBSDID("$FreeBSD$");
  42
  43 #include "opt_bpf.h"
  44 #include "opt_ddb.h"
  45 #include "opt_netgraph.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/conf.h>
  49 #include <sys/eventhandler.h>
  50 #include <sys/fcntl.h>
  51 #include <sys/jail.h>
  52 #include <sys/ktr.h>
  53 #include <sys/lock.h>
  54 #include <sys/malloc.h>
  55 #include <sys/mbuf.h>
  56 #include <sys/mutex.h>
  57 #include <sys/time.h>
  58 #include <sys/priv.h>
  59 #include <sys/proc.h>
  60 #include <sys/signalvar.h>
  61 #include <sys/filio.h>
  62 #include <sys/sockio.h>
  63 #include <sys/ttycom.h>
  64 #include <sys/uio.h>
  65 #include <sys/sysent.h>
  66 #include <sys/systm.h>
  67
  68 #include <sys/event.h>
  69 #include <sys/file.h>
  70 #include <sys/poll.h>
  71 #include <sys/proc.h>
  72
  73 #include <sys/socket.h>
  74
  75 #ifdef DDB
  76 #include <ddb/ddb.h>
  77 #endif
  78
  79 #include <net/if.h>
  80 #include <net/if_var.h>
  81 #include <net/if_private.h>
  82 #include <net/if_vlan_var.h>
  83 #include <net/if_dl.h>
  84 #include <net/bpf.h>
  85 #include <net/bpf_buffer.h>
  86 #ifdef BPF_JITTER
  87 #include <net/bpf_jitter.h>
  88 #endif
  89 #include <net/bpf_zerocopy.h>
  90 #include <net/bpfdesc.h>
  91 #include <net/route.h>
  92 #include <net/vnet.h>
  93
  94 #include <netinet/in.h>
  95 #include <netinet/if_ether.h>
  96 #include <sys/kernel.h>
  97 #include <sys/sysctl.h>
  98
  99 #include <net80211/ieee80211_freebsd.h>
 100
 101 #include <security/mac/mac_framework.h>
 102
 103 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 104
/*
 * Stand-alone bpf_if_ext whose descriptor list is statically initialized
 * to empty.
 * NOTE(review): presumably substituted for an interface's BPF state once
 * the interface has been detached -- confirm against bpfdetach().
 */
static struct bpf_if_ext dead_bpf_if = {
	.bif_dlist = CK_LIST_HEAD_INITIALIZER()
};
 108
/*
 * BPF state for one (interface, DLT) attachment.  The public part, bif_ext,
 * has to be the first member; the CTASSERT below enforces this.
 */
struct bpf_if {
/* Shorthands so the public members read as if they were direct fields. */
#define bif_next        bif_ext.bif_next
#define bif_dlist       bif_ext.bif_dlist
	struct bpf_if_ext bif_ext;      /* public members */
	u_int           bif_dlt;        /* link layer type */
	u_int           bif_hdrlen;     /* length of link header */
	struct bpfd_list bif_wlist;     /* writer-only list */
	struct ifnet    *bif_ifp;       /* corresponding interface */
	struct bpf_if   **bif_bpf;      /* Pointer to pointer to us */
	volatile u_int  bif_refcnt;     /* managed by bpfif_ref()/bpfif_rele() */
	struct epoch_context epoch_ctx; /* handed to NET_EPOCH_CALL() on free */
};

/* bif_ext must live at offset 0 of struct bpf_if. */
CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 123
 124 struct bpf_program_buffer {
 125         struct epoch_context    epoch_ctx;
 126 #ifdef BPF_JITTER
 127         bpf_jit_filter          *func;
 128 #endif
 129         void                    *buffer[0];
 130 };
 131
 132 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 133
#define PRINET  26                      /* interruptible */
#define BPF_PRIO_MAX    7

/*
 * Size of a BPF header type up to and including its bh_hdrlen member,
 * i.e. without any trailing padding the compiler adds to the structure.
 */
#define SIZEOF_BPF_HDR(type)    \
    (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
 139
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>
/* Alignment helpers for 32-bit consumers. */
#define BPF_ALIGNMENT32 sizeof(int32_t)
#define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)

#ifndef BURN_BRIDGES
/*
 * 32-bit version of the structure prepended to each packet.  We use this
 * header instead of the standard one for 32-bit streams.  We mark a stream
 * as 32-bit the first time we see a 32-bit compat ioctl request.
 */
struct bpf_hdr32 {
	struct timeval32 bh_tstamp;     /* time stamp */
	uint32_t        bh_caplen;      /* length of captured portion */
	uint32_t        bh_datalen;     /* original length of packet */
	uint16_t        bh_hdrlen;      /* length of bpf header (this struct
					   plus alignment padding) */
};
#endif

/* 32-bit layout of struct bpf_program: the pointer is carried as uint32_t. */
struct bpf_program32 {
	u_int bf_len;
	uint32_t bf_insns;
};

/* 32-bit layout of struct bpf_dltlist: the pointer is carried as u_int. */
struct bpf_dltlist32 {
	u_int   bfl_len;
	u_int   bfl_list;
};

/* ioctl request codes built from the 32-bit argument structures. */
#define BIOCSETF32      _IOW('B', 103, struct bpf_program32)
#define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32)
#define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32)
#define BIOCGDLTLIST32  _IOWR('B', 121, struct bpf_dltlist32)
#define BIOCSETWF32     _IOW('B', 123, struct bpf_program32)
#define BIOCSETFNR32    _IOW('B', 130, struct bpf_program32)
#endif
 178
/* Global BPF lock: always taken exclusively on the bpf_sx sx lock. */
#define BPF_LOCK()         sx_xlock(&bpf_sx)
#define BPF_UNLOCK()            sx_xunlock(&bpf_sx)
#define BPF_LOCK_ASSERT()       sx_assert(&bpf_sx, SA_XLOCKED)
/*
 * bpf_iflist is a list of BPF interface structures, each corresponding to a
 * specific DLT. The same network interface might have several BPF interface
 * structures registered by different layers in the stack (e.g., 802.11
 * frames, ethernet frames, etc).
 */
CK_LIST_HEAD(bpf_iflist, bpf_if);
static struct bpf_iflist bpf_iflist;
static struct sx        bpf_sx;         /* bpf global lock */
static int              bpf_bpfd_cnt;   /* adjusted in bpf_attachd() and
					   bpf_detachd_locked() */
 192
/* Forward declarations of file-local functions. */

/* Reference counting for bpf_if and bpf_d. */
static void     bpfif_ref(struct bpf_if *);
static void     bpfif_rele(struct bpf_if *);

static void     bpfd_ref(struct bpf_d *);
static void     bpfd_rele(struct bpf_d *);
/* Descriptor <-> interface attachment. */
static void     bpf_attachd(struct bpf_d *, struct bpf_if *);
static void     bpf_detachd(struct bpf_d *);
static void     bpf_detachd_locked(struct bpf_d *, bool);
static void     bpfd_free(epoch_context_t);
static int      bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
		    struct sockaddr *, int *, struct bpf_d *);
static int      bpf_setif(struct bpf_d *, struct ifreq *);
static void     bpf_timed_out(void *);
static __inline void
		bpf_wakeup(struct bpf_d *);
static void     catchpacket(struct bpf_d *, u_char *, u_int, u_int,
		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
		    struct bintime *);
static void     reset_d(struct bpf_d *);
static int      bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
static int      bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int      bpf_setdlt(struct bpf_d *, u_int);
/* kqueue filter callbacks. */
static void     filt_bpfdetach(struct knote *);
static int      filt_bpfread(struct knote *, long);
static int      filt_bpfwrite(struct knote *, long);
static void     bpf_drvinit(void *);
static int      bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 220
/* net.bpf sysctl tree. */
SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "bpf sysctl");
/* net.bpf.maxinsns: not static, so it is visible outside this file. */
int bpf_maxinsns = BPF_MAXINSNS;
SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
    &bpf_maxinsns, 0, "Maximum bpf program instructions");
/* net.bpf.zerocopy_enable: off by default. */
static int bpf_zerocopy_enable = 0;
SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
/* net.bpf.stats: served by bpf_stats_sysctl(). */
static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
    bpf_stats_sysctl, "bpf statistics portal");

/*
 * net.bpf.optimize_writers: per-vnet, off by default.  When set,
 * bpf_attachd() places new descriptors on the writer-only list.
 */
VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
#define V_bpf_optimize_writers VNET(bpf_optimize_writers)
SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN,
    &VNET_NAME(bpf_optimize_writers), 0,
    "Do not send packets until BPF program is set");
 237
/* Character-device entry points implemented in this file. */
static  d_open_t        bpfopen;
static  d_read_t        bpfread;
static  d_write_t       bpfwrite;
static  d_ioctl_t       bpfioctl;
static  d_poll_t        bpfpoll;
static  d_kqfilter_t    bpfkqfilter;

/* Device switch for the "bpf" character device. */
static struct cdevsw bpf_cdevsw = {
	.d_version =    D_VERSION,
	.d_open =       bpfopen,
	.d_read =       bpfread,
	.d_write =      bpfwrite,
	.d_ioctl =      bpfioctl,
	.d_poll =       bpfpoll,
	.d_name =       "bpf",
	.d_kqfilter =   bpfkqfilter,
};
 255
/* kqueue filter operations whose event callback is filt_bpfread(). */
static struct filterops bpfread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_bpfdetach,
	.f_event = filt_bpfread,
};
 261
/* kqueue filter operations whose event callback is filt_bpfwrite(). */
static struct filterops bpfwrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_bpfdetach,
	.f_event = filt_bpfwrite,
};
 267
/*
 * LOCKING MODEL USED BY BPF
 *
 * Locks:
 * 1) Global lock (BPF_LOCK). An sx lock, used to protect some global
 * counters and every bpf_iflist change, and to serialize ioctl access to
 * bpf descriptors.
 * 2) Descriptor lock. A mutex, used to protect BPF buffers and various
 * structure fields used by the bpf_*tap* code.
 *
 * Lock order: global lock, then descriptor lock.
 *
 * There are several possible consumers:
 *
 * 1. The kernel registers an interface pointer with bpfattach().
 * Each call allocates a new bpf_if structure, references the ifnet pointer
 * and links the bpf_if into the bpf_iflist chain. This is protected by the
 * global lock.
 *
 * 2. A userland application issues an ioctl() call on a bpf_d descriptor.
 * All such calls are serialized by the global lock. BPF filters can be
 * changed, but the pointer to the old filter is freed using
 * NET_EPOCH_CALL(). Thus it should be safe for the bpf_tap/bpf_mtap* code
 * to access filter pointers, even if a change happens during bpf_tap
 * execution. Destruction of a bpf_d descriptor is also done using
 * NET_EPOCH_CALL().
 *
 * 3. A userland application can write packets into a bpf_d descriptor.
 * Here we need to be sure that the ifnet won't disappear during bpfwrite().
 *
 * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. Access to
 * bif_dlist is protected by a net_epoch_preempt section, so it should
 * be safe to access a bpf_d descriptor inside the section.
 *
 * 5. The kernel invokes bpfdetach() when an interface is destroyed. All
 * lists are modified with the global lock held and the actual free() is
 * done using NET_EPOCH_CALL().
 */
 304
 305 static void
 306 bpfif_free(epoch_context_t ctx)
 307 {
 308         struct bpf_if *bp;
 309
 310         bp = __containerof(ctx, struct bpf_if, epoch_ctx);
 311         if_rele(bp->bif_ifp);
 312         free(bp, M_BPF);
 313 }
 314
 315 static void
 316 bpfif_ref(struct bpf_if *bp)
 317 {
 318
 319         refcount_acquire(&bp->bif_refcnt);
 320 }
 321
 322 static void
 323 bpfif_rele(struct bpf_if *bp)
 324 {
 325
 326         if (!refcount_release(&bp->bif_refcnt))
 327                 return;
 328         NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx);
 329 }
 330
 331 static void
 332 bpfd_ref(struct bpf_d *d)
 333 {
 334
 335         refcount_acquire(&d->bd_refcnt);
 336 }
 337
 338 static void
 339 bpfd_rele(struct bpf_d *d)
 340 {
 341
 342         if (!refcount_release(&d->bd_refcnt))
 343                 return;
 344         NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx);
 345 }
 346
 347 static struct bpf_program_buffer*
 348 bpf_program_buffer_alloc(size_t size, int flags)
 349 {
 350
 351         return (malloc(sizeof(struct bpf_program_buffer) + size,
 352             M_BPF, flags));
 353 }
 354
 355 static void
 356 bpf_program_buffer_free(epoch_context_t ctx)
 357 {
 358         struct bpf_program_buffer *ptr;
 359
 360         ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
 361 #ifdef BPF_JITTER
 362         if (ptr->func != NULL)
 363                 bpf_destroy_jit_filter(ptr->func);
 364 #endif
 365         free(ptr, M_BPF);
 366 }
 367
 368 /*
 369  * Wrapper functions for various buffering methods.  If the set of buffer
 370  * modes expands, we will probably want to introduce a switch data structure
 371  * similar to protosw, et.
 372  */
 373 static void
 374 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
 375     u_int len)
 376 {
 377
 378         BPFD_LOCK_ASSERT(d);
 379
 380         switch (d->bd_bufmode) {
 381         case BPF_BUFMODE_BUFFER:
 382                 return (bpf_buffer_append_bytes(d, buf, offset, src, len));
 383
 384         case BPF_BUFMODE_ZBUF:
 385                 counter_u64_add(d->bd_zcopy, 1);
 386                 return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
 387
 388         default:
 389                 panic("bpf_buf_append_bytes");
 390         }
 391 }
 392
 393 static void
 394 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
 395     u_int len)
 396 {
 397
 398         BPFD_LOCK_ASSERT(d);
 399
 400         switch (d->bd_bufmode) {
 401         case BPF_BUFMODE_BUFFER:
 402                 return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
 403
 404         case BPF_BUFMODE_ZBUF:
 405                 counter_u64_add(d->bd_zcopy, 1);
 406                 return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
 407
 408         default:
 409                 panic("bpf_buf_append_mbuf");
 410         }
 411 }
 412
 413 /*
 414  * This function gets called when the free buffer is re-assigned.
 415  */
 416 static void
 417 bpf_buf_reclaimed(struct bpf_d *d)
 418 {
 419
 420         BPFD_LOCK_ASSERT(d);
 421
 422         switch (d->bd_bufmode) {
 423         case BPF_BUFMODE_BUFFER:
 424                 return;
 425
 426         case BPF_BUFMODE_ZBUF:
 427                 bpf_zerocopy_buf_reclaimed(d);
 428                 return;
 429
 430         default:
 431                 panic("bpf_buf_reclaimed");
 432         }
 433 }
 434
 435 /*
 436  * If the buffer mechanism has a way to decide that a held buffer can be made
 437  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
 438  * returned if the buffer can be discarded, (0) is returned if it cannot.
 439  */
 440 static int
 441 bpf_canfreebuf(struct bpf_d *d)
 442 {
 443
 444         BPFD_LOCK_ASSERT(d);
 445
 446         switch (d->bd_bufmode) {
 447         case BPF_BUFMODE_ZBUF:
 448                 return (bpf_zerocopy_canfreebuf(d));
 449         }
 450         return (0);
 451 }
 452
 453 /*
 454  * Allow the buffer model to indicate that the current store buffer is
 455  * immutable, regardless of the appearance of space.  Return (1) if the
 456  * buffer is writable, and (0) if not.
 457  */
 458 static int
 459 bpf_canwritebuf(struct bpf_d *d)
 460 {
 461         BPFD_LOCK_ASSERT(d);
 462
 463         switch (d->bd_bufmode) {
 464         case BPF_BUFMODE_ZBUF:
 465                 return (bpf_zerocopy_canwritebuf(d));
 466         }
 467         return (1);
 468 }
 469
 470 /*
 471  * Notify buffer model that an attempt to write to the store buffer has
 472  * resulted in a dropped packet, in which case the buffer may be considered
 473  * full.
 474  */
 475 static void
 476 bpf_buffull(struct bpf_d *d)
 477 {
 478
 479         BPFD_LOCK_ASSERT(d);
 480
 481         switch (d->bd_bufmode) {
 482         case BPF_BUFMODE_ZBUF:
 483                 bpf_zerocopy_buffull(d);
 484                 break;
 485         }
 486 }
 487
 488 /*
 489  * Notify the buffer model that a buffer has moved into the hold position.
 490  */
 491 void
 492 bpf_bufheld(struct bpf_d *d)
 493 {
 494
 495         BPFD_LOCK_ASSERT(d);
 496
 497         switch (d->bd_bufmode) {
 498         case BPF_BUFMODE_ZBUF:
 499                 bpf_zerocopy_bufheld(d);
 500                 break;
 501         }
 502 }
 503
 504 static void
 505 bpf_free(struct bpf_d *d)
 506 {
 507
 508         switch (d->bd_bufmode) {
 509         case BPF_BUFMODE_BUFFER:
 510                 return (bpf_buffer_free(d));
 511
 512         case BPF_BUFMODE_ZBUF:
 513                 return (bpf_zerocopy_free(d));
 514
 515         default:
 516                 panic("bpf_buf_free");
 517         }
 518 }
 519
 520 static int
 521 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
 522 {
 523
 524         if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 525                 return (EOPNOTSUPP);
 526         return (bpf_buffer_uiomove(d, buf, len, uio));
 527 }
 528
 529 static int
 530 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
 531 {
 532
 533         if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 534                 return (EOPNOTSUPP);
 535         return (bpf_buffer_ioctl_sblen(d, i));
 536 }
 537
 538 static int
 539 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 540 {
 541
 542         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 543                 return (EOPNOTSUPP);
 544         return (bpf_zerocopy_ioctl_getzmax(td, d, i));
 545 }
 546
 547 static int
 548 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 549 {
 550
 551         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 552                 return (EOPNOTSUPP);
 553         return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
 554 }
 555
 556 static int
 557 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 558 {
 559
 560         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 561                 return (EOPNOTSUPP);
 562         return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
 563 }
 564
 565 /*
 566  * General BPF functions.
 567  */
 568 static int
 569 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
 570     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
 571 {
 572         const struct ieee80211_bpf_params *p;
 573         struct ether_header *eh;
 574         struct mbuf *m;
 575         int error;
 576         int len;
 577         int hlen;
 578         int slen;
 579
 580         /*
 581          * Build a sockaddr based on the data link layer type.
 582          * We do this at this level because the ethernet header
 583          * is copied directly into the data field of the sockaddr.
 584          * In the case of SLIP, there is no header and the packet
 585          * is forwarded as is.
 586          * Also, we are careful to leave room at the front of the mbuf
 587          * for the link level header.
 588          */
 589         switch (linktype) {
 590         case DLT_SLIP:
 591                 sockp->sa_family = AF_INET;
 592                 hlen = 0;
 593                 break;
 594
 595         case DLT_EN10MB:
 596                 sockp->sa_family = AF_UNSPEC;
 597                 /* XXX Would MAXLINKHDR be better? */
 598                 hlen = ETHER_HDR_LEN;
 599                 break;
 600
 601         case DLT_FDDI:
 602                 sockp->sa_family = AF_IMPLINK;
 603                 hlen = 0;
 604                 break;
 605
 606         case DLT_RAW:
 607                 sockp->sa_family = AF_UNSPEC;
 608                 hlen = 0;
 609                 break;
 610
 611         case DLT_NULL:
 612                 /*
 613                  * null interface types require a 4 byte pseudo header which
 614                  * corresponds to the address family of the packet.
 615                  */
 616                 sockp->sa_family = AF_UNSPEC;
 617                 hlen = 4;
 618                 break;
 619
 620         case DLT_ATM_RFC1483:
 621                 /*
 622                  * en atm driver requires 4-byte atm pseudo header.
 623                  * though it isn't standard, vpi:vci needs to be
 624                  * specified anyway.
 625                  */
 626                 sockp->sa_family = AF_UNSPEC;
 627                 hlen = 12;      /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 628                 break;
 629
 630         case DLT_PPP:
 631                 sockp->sa_family = AF_UNSPEC;
 632                 hlen = 4;       /* This should match PPP_HDRLEN */
 633                 break;
 634
 635         case DLT_IEEE802_11:            /* IEEE 802.11 wireless */
 636                 sockp->sa_family = AF_IEEE80211;
 637                 hlen = 0;
 638                 break;
 639
 640         case DLT_IEEE802_11_RADIO:      /* IEEE 802.11 wireless w/ phy params */
 641                 sockp->sa_family = AF_IEEE80211;
 642                 sockp->sa_len = 12;     /* XXX != 0 */
 643                 hlen = sizeof(struct ieee80211_bpf_params);
 644                 break;
 645
 646         default:
 647                 return (EIO);
 648         }
 649
 650         len = uio->uio_resid;
 651         if (len < hlen || len - hlen > ifp->if_mtu)
 652                 return (EMSGSIZE);
 653
 654         /* Allocate a mbuf for our write, since m_get2 fails if len >= to MJUMPAGESIZE, use m_getjcl for bigger buffers */
 655         m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR);
 656         if (m == NULL)
 657                 return (EIO);
 658         m->m_pkthdr.len = m->m_len = len;
 659         *mp = m;
 660
 661         error = uiomove(mtod(m, u_char *), len, uio);
 662         if (error)
 663                 goto bad;
 664
 665         slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
 666         if (slen == 0) {
 667                 error = EPERM;
 668                 goto bad;
 669         }
 670
 671         /* Check for multicast destination */
 672         switch (linktype) {
 673         case DLT_EN10MB:
 674                 eh = mtod(m, struct ether_header *);
 675                 if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 676                         if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
 677                             ETHER_ADDR_LEN) == 0)
 678                                 m->m_flags |= M_BCAST;
 679                         else
 680                                 m->m_flags |= M_MCAST;
 681                 }
 682                 if (d->bd_hdrcmplt == 0) {
 683                         memcpy(eh->ether_shost, IF_LLADDR(ifp),
 684                             sizeof(eh->ether_shost));
 685                 }
 686                 break;
 687         }
 688
 689         /*
 690          * Make room for link header, and copy it to sockaddr
 691          */
 692         if (hlen != 0) {
 693                 if (sockp->sa_family == AF_IEEE80211) {
 694                         /*
 695                          * Collect true length from the parameter header
 696                          * NB: sockp is known to be zero'd so if we do a
 697                          *     short copy unspecified parameters will be
 698                          *     zero.
 699                          * NB: packet may not be aligned after stripping
 700                          *     bpf params
 701                          * XXX check ibp_vers
 702                          */
 703                         p = mtod(m, const struct ieee80211_bpf_params *);
 704                         hlen = p->ibp_len;
 705                         if (hlen > sizeof(sockp->sa_data)) {
 706                                 error = EINVAL;
 707                                 goto bad;
 708                         }
 709                 }
 710                 bcopy(mtod(m, const void *), sockp->sa_data, hlen);
 711         }
 712         *hdrlen = hlen;
 713
 714         return (0);
 715 bad:
 716         m_freem(m);
 717         return (error);
 718 }
 719
 720 /*
 721  * Attach descriptor to the bpf interface, i.e. make d listen on bp,
 722  * then reset its buffers and counters with reset_d().
 723  */
 724 static void
 725 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 726 {
 727         int op_w;
 728
 729         BPF_LOCK_ASSERT();
 730
 731         /*
 732          * Save sysctl value to protect from sysctl change
 733          * between reads
 734          */
 735         op_w = V_bpf_optimize_writers || d->bd_writer;
 736
 737         if (d->bd_bif != NULL)
 738                 bpf_detachd_locked(d, false);
 739         /*
 740          * Point d at bp, and add d to the interface's list.
 741          * Since there are many applications using BPF for
 742          * sending raw packets only (dhcpd, cdpd are good examples)
 743          * we can delay adding d to the list of active listeners until
 744          * some filter is configured.
 745          */
 746
 747         BPFD_LOCK(d);
 748         /*
 749          * Hold reference to bpif while descriptor uses this interface.
 750          */
 751         bpfif_ref(bp);
 752         d->bd_bif = bp;
 753         if (op_w != 0) {
 754                 /* Add to writers-only list */
 755                 CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 756                 /*
 757                  * We decrement bd_writer on every filter set operation.
 758                  * First BIOCSETF is done by pcap_open_live() to set up
 759                  * snap length. After that appliation usually sets its own
 760                  * filter.
 761                  */
 762                 d->bd_writer = 2;
 763         } else
 764                 CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 765
 766         reset_d(d);
 767
 768         /* Trigger EVFILT_WRITE events. */
 769         bpf_wakeup(d);
 770
 771         BPFD_UNLOCK(d);
 772         bpf_bpfd_cnt++;
 773
 774         CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
 775             __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
 776
 777         if (op_w == 0)
 778                 EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 779 }
 780
 781 /*
 782  * Check if we need to upgrade our descriptor @d from write-only mode.
 783  */
 784 static int
 785 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
 786     int flen)
 787 {
 788         int is_snap, need_upgrade;
 789
 790         /*
 791          * Check if we've already upgraded or new filter is empty.
 792          */
 793         if (d->bd_writer == 0 || fcode == NULL)
 794                 return (0);
 795
 796         need_upgrade = 0;
 797
 798         /*
 799          * Check if cmd looks like snaplen setting from
 800          * pcap_bpf.c:pcap_open_live().
 801          * Note we're not checking .k value here:
 802          * while pcap_open_live() definitely sets to non-zero value,
 803          * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
 804          * do not consider upgrading immediately
 805          */
 806         if (cmd == BIOCSETF && flen == 1 &&
 807             fcode[0].code == (BPF_RET | BPF_K))
 808                 is_snap = 1;
 809         else
 810                 is_snap = 0;
 811
 812         if (is_snap == 0) {
 813                 /*
 814                  * We're setting first filter and it doesn't look like
 815                  * setting snaplen.  We're probably using bpf directly.
 816                  * Upgrade immediately.
 817                  */
 818                 need_upgrade = 1;
 819         } else {
 820                 /*
 821                  * Do not require upgrade by first BIOCSETF
 822                  * (used to set snaplen) by pcap_open_live().
 823                  */
 824
 825                 if (--d->bd_writer == 0) {
 826                         /*
 827                          * First snaplen filter has already
 828                          * been set. This is probably catch-all
 829                          * filter
 830                          */
 831                         need_upgrade = 1;
 832                 }
 833         }
 834
 835         CTR5(KTR_NET,
 836             "%s: filter function set by pid %d, "
 837             "bd_writer counter %d, snap %d upgrade %d",
 838             __func__, d->bd_pid, d->bd_writer,
 839             is_snap, need_upgrade);
 840
 841         return (need_upgrade);
 842 }
 843
 844 /*
 845  * Detach a file from its interface.
 846  */
 847 static void
 848 bpf_detachd(struct bpf_d *d)
 849 {
 850         BPF_LOCK();
 851         bpf_detachd_locked(d, false);
 852         BPF_UNLOCK();
 853 }
 854
 855 static void
 856 bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 857 {
 858         struct bpf_if *bp;
 859         struct ifnet *ifp;
 860         int error;
 861
 862         BPF_LOCK_ASSERT();
 863         CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 864
 865         /* Check if descriptor is attached */
 866         if ((bp = d->bd_bif) == NULL)
 867                 return;
 868
 869         BPFD_LOCK(d);
 870         /* Remove d from the interface's descriptor list. */
 871         CK_LIST_REMOVE(d, bd_next);
 872         /* Save bd_writer value */
 873         error = d->bd_writer;
 874         ifp = bp->bif_ifp;
 875         d->bd_bif = NULL;
 876         if (detached_ifp) {
 877                 /*
 878                  * Notify descriptor as it's detached, so that any
 879                  * sleepers wake up and get ENXIO.
 880                  */
 881                 bpf_wakeup(d);
 882         }
 883         BPFD_UNLOCK(d);
 884         bpf_bpfd_cnt--;
 885
 886         /* Call event handler iff d is attached */
 887         if (error == 0)
 888                 EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 889
 890         /*
 891          * Check if this descriptor had requested promiscuous mode.
 892          * If so and ifnet is not detached, turn it off.
 893          */
 894         if (d->bd_promisc && !detached_ifp) {
 895                 d->bd_promisc = 0;
 896                 CURVNET_SET(ifp->if_vnet);
 897                 error = ifpromisc(ifp, 0);
 898                 CURVNET_RESTORE();
 899                 if (error != 0 && error != ENXIO) {
 900                         /*
 901                          * ENXIO can happen if a pccard is unplugged
 902                          * Something is really wrong if we were able to put
 903                          * the driver into promiscuous mode, but can't
 904                          * take it out.
 905                          */
 906                         if_printf(bp->bif_ifp,
 907                                 "bpf_detach: ifpromisc failed (%d)\n", error);
 908                 }
 909         }
 910         bpfif_rele(bp);
 911 }
 912
 913 /*
 914  * Close the descriptor by detaching it from its interface,
 915  * deallocating its buffers, and marking it free.
      *
      * bpfopen() registers this routine with devfs_set_cdevpriv() as the
      * destructor of the per-open descriptor; 'data' is that struct bpf_d.
      * The steps run in a fixed order: quiesce the descriptor, detach it,
      * tear down its notification state, then drop the reference.
      *
      * NOTE(review): no buffer is freed directly in this function; presumably
      * that happens once bpfd_rele() drops the last reference -- confirm.
 916  */
 917 static void
 918 bpf_dtor(void *data)
 919 {
 920         struct bpf_d *d = data;
 921
             /* Disarm the callout if it is pending and force the idle state. */
 922         BPFD_LOCK(d);
 923         if (d->bd_state == BPF_WAITING)
 924                 callout_stop(&d->bd_callout);
 925         d->bd_state = BPF_IDLE;
 926         BPFD_UNLOCK(d);
             /* Drop the signal owner recorded in bd_sigio before detaching. */
 927         funsetown(&d->bd_sigio);
 928         bpf_detachd(d);
 929 #ifdef MAC
 930         mac_bpfdesc_destroy(d);
 931 #endif /* MAC */
             /*
              * Tear down select/knote state and drain the callout before the
              * reference is released; all three use storage inside *d.
              */
 932         seldrain(&d->bd_sel);
 933         knlist_destroy(&d->bd_sel.si_note);
 934         callout_drain(&d->bd_callout);
             /* Presumably pairs with refcount_init(..., 1) in bpfopen(). */
 935         bpfd_rele(d);
 936 }
 937
 938 /*
 939  * Open ethernet device.  Returns ENXIO for illegal minor device number,
 940  * EBUSY if file is open by another process.
 941  */
 942 /* ARGSUSED */
 943 static  int
 944 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 945 {
 946         struct bpf_d *d;
 947         int error;
 948
 949         d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 950         error = devfs_set_cdevpriv(d, bpf_dtor);
 951         if (error != 0) {
 952                 free(d, M_BPF);
 953                 return (error);
 954         }
 955
 956         /* Setup counters */
 957         d->bd_rcount = counter_u64_alloc(M_WAITOK);
 958         d->bd_dcount = counter_u64_alloc(M_WAITOK);
 959         d->bd_fcount = counter_u64_alloc(M_WAITOK);
 960         d->bd_wcount = counter_u64_alloc(M_WAITOK);
 961         d->bd_wfcount = counter_u64_alloc(M_WAITOK);
 962         d->bd_wdcount = counter_u64_alloc(M_WAITOK);
 963         d->bd_zcopy = counter_u64_alloc(M_WAITOK);
 964
 965         /*
 966          * For historical reasons, perform a one-time initialization call to
 967          * the buffer routines, even though we're not yet committed to a
 968          * particular buffer method.
 969          */
 970         bpf_buffer_init(d);
 971         if ((flags & FREAD) == 0)
 972                 d->bd_writer = 2;
 973         d->bd_hbuf_in_use = 0;
 974         d->bd_bufmode = BPF_BUFMODE_BUFFER;
 975         d->bd_sig = SIGIO;
 976         d->bd_direction = BPF_D_INOUT;
 977         refcount_init(&d->bd_refcnt, 1);
 978         BPF_PID_REFRESH(d, td);
 979 #ifdef MAC
 980         mac_bpfdesc_init(d);
 981         mac_bpfdesc_create(td->td_ucred, d);
 982 #endif
 983         mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
 984         callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
 985         knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
 986
 987         /* Disable VLAN pcp tagging. */
 988         d->bd_pcp = 0;
 989
 990         return (0);
 991 }
 992
 993 /*
 994  *  bpfread - read next chunk of packets from buffers
      *
      *  Copies the hold buffer out to the caller.  If the hold buffer is
      *  empty the routine sleeps (bounded by bd_rtout) until data shows up,
      *  unless immediate mode, O_NONBLOCK or an already expired timeout
      *  allows the store buffer to be rotated in right away.
      *
      *  Returns the bpf_uiomove() result once a buffer was copied out, 0 if
      *  the sleep timed out with nothing buffered, and otherwise:
      *    - the devfs_get_cdevpriv() error,
      *    - EINVAL if the request size differs from bd_bufsize,
      *    - EOPNOTSUPP if the descriptor is not in BPF_BUFMODE_BUFFER mode,
      *    - ENXIO if no data is ready and no interface is attached,
      *    - EWOULDBLOCK if no data is ready and O_NONBLOCK is set,
      *    - the mtx_sleep() error, or EINTR/ERESTART from msleep().
 995  */
 996 static  int
 997 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 998 {
 999         struct bpf_d *d;
1000         int error;
1001         int non_block;
1002         int timed_out;
1003
1004         error = devfs_get_cdevpriv((void **)&d);
1005         if (error != 0)
1006                 return (error);
1007
1008         /*
1009          * Restrict application to use a buffer the same size as
1010          * kernel buffers.
1011          */
1012         if (uio->uio_resid != d->bd_bufsize)
1013                 return (EINVAL);
1014
1015         non_block = ((ioflag & O_NONBLOCK) != 0);
1016
1017         BPFD_LOCK(d);
1018         BPF_PID_REFRESH_CUR(d);
1019         if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
1020                 BPFD_UNLOCK(d);
1021                 return (EOPNOTSUPP);
1022         }
             /*
              * Disarm a pending callout, and remember whether it had already
              * fired (BPF_TIMED_OUT) before the state is reset: an expired
              * timeout lets a partly filled store buffer be returned below.
              */
1023         if (d->bd_state == BPF_WAITING)
1024                 callout_stop(&d->bd_callout);
1025         timed_out = (d->bd_state == BPF_TIMED_OUT);
1026         d->bd_state = BPF_IDLE;
             /*
              * Wait while another thread has the hold buffer claimed (see the
              * bd_hbuf_in_use = 1 further down).
              */
1027         while (d->bd_hbuf_in_use) {
1028                 error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1029                     PRINET|PCATCH, "bd_hbuf", 0);
1030                 if (error != 0) {
1031                         BPFD_UNLOCK(d);
1032                         return (error);
1033                 }
1034         }
1035         /*
1036          * If the hold buffer is empty, then do a timed sleep, which
1037          * ends when the timeout expires or when enough packets
1038          * have arrived to fill the store buffer.
1039          */
1040         while (d->bd_hbuf == NULL) {
1041                 if (d->bd_slen != 0) {
1042                         /*
1043                          * A packet(s) either arrived since the previous
1044                          * read or arrived while we were asleep.
1045                          */
1046                         if (d->bd_immediate || non_block || timed_out) {
1047                                 /*
1048                                  * Rotate the buffers and return what's here
1049                                  * if we are in immediate mode, non-blocking
1050                                  * flag is set, or this descriptor timed out.
1051                                  */
1052                                 ROTATE_BUFFERS(d);
1053                                 break;
1054                         }
1055                 }
1056
1057                 /*
1058                  * No data is available, check to see if the bpf device
1059                  * is still pointed at a real interface.  If not, return
1060                  * ENXIO so that the userland process knows to rebind
1061                  * it before using it again.
1062                  */
1063                 if (d->bd_bif == NULL) {
1064                         BPFD_UNLOCK(d);
1065                         return (ENXIO);
1066                 }
1067
1068                 if (non_block) {
1069                         BPFD_UNLOCK(d);
1070                         return (EWOULDBLOCK);
1071                 }
1072                 error = msleep(d, &d->bd_lock, PRINET|PCATCH,
1073                      "bpf", d->bd_rtout);
1074                 if (error == EINTR || error == ERESTART) {
1075                         BPFD_UNLOCK(d);
1076                         return (error);
1077                 }
1078                 if (error == EWOULDBLOCK) {
1079                         /*
1080                          * On a timeout, return what's in the buffer,
1081                          * which may be nothing.  If there is something
1082                          * in the store buffer, we can rotate the buffers.
1083                          */
1084                         if (d->bd_hbuf)
1085                                 /*
1086                                  * We filled up the buffer in between
1087                                  * getting the timeout and arriving
1088                                  * here, so we don't need to rotate.
1089                                  */
1090                                 break;
1091
1092                         if (d->bd_slen == 0) {
1093                                 BPFD_UNLOCK(d);
1094                                 return (0);
1095                         }
1096                         ROTATE_BUFFERS(d);
1097                         break;
1098                 }
                     /* Any other wakeup: go around and re-check the buffers. */
1099         }
1100         /*
1101          * At this point, we know we have something in the hold slot.
1102          */
             /* Claim the hold buffer so it stays ours while the lock is dropped. */
1103         d->bd_hbuf_in_use = 1;
1104         BPFD_UNLOCK(d);
1105
1106         /*
1107          * Move data from hold buffer into user space.
1108          * We know the entire buffer is transferred since
1109          * we checked above that the read buffer is bpf_bufsize bytes.
1110          *
1111          * We do not have to worry about simultaneous reads because
1112          * we waited for sole access to the hold buffer above.
1113          */
1114         error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
1115
             /*
              * Whether or not the copy succeeded, the hold buffer becomes the
              * free buffer again and threads sleeping on bd_hbuf_in_use are
              * woken up.
              */
1116         BPFD_LOCK(d);
1117         KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
1118         d->bd_fbuf = d->bd_hbuf;
1119         d->bd_hbuf = NULL;
1120         d->bd_hlen = 0;
1121         bpf_buf_reclaimed(d);
1122         d->bd_hbuf_in_use = 0;
1123         wakeup(&d->bd_hbuf_in_use);
1124         BPFD_UNLOCK(d);
1125
1126         return (error);
1127 }
1128
1129 /*
1130  * If there are processes sleeping on this descriptor, wake them up.
1131  */
1132 static __inline void
1133 bpf_wakeup(struct bpf_d *d)
1134 {
1135
1136         BPFD_LOCK_ASSERT(d);
1137         if (d->bd_state == BPF_WAITING) {
1138                 callout_stop(&d->bd_callout);
1139                 d->bd_state = BPF_IDLE;
1140         }
1141         wakeup(d);
1142         if (d->bd_async && d->bd_sig && d->bd_sigio)
1143                 pgsigio(&d->bd_sigio, d->bd_sig, 0);
1144
1145         selwakeuppri(&d->bd_sel, PRINET);
1146         KNOTE_LOCKED(&d->bd_sel.si_note, 0);
1147 }
1148
1149 static void
1150 bpf_timed_out(void *arg)
1151 {
1152         struct bpf_d *d = (struct bpf_d *)arg;
1153
1154         BPFD_LOCK_ASSERT(d);
1155
1156         if (callout_pending(&d->bd_callout) ||
1157             !callout_active(&d->bd_callout))
1158                 return;
1159         if (d->bd_state == BPF_WAITING) {
1160                 d->bd_state = BPF_TIMED_OUT;
1161                 if (d->bd_slen != 0)
1162                         bpf_wakeup(d);
1163         }
1164 }
1165
1166 static int
1167 bpf_ready(struct bpf_d *d)
1168 {
1169
1170         BPFD_LOCK_ASSERT(d);
1171
1172         if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
1173                 return (1);
1174         if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1175             d->bd_slen != 0)
1176                 return (1);
1177         return (0);
1178 }
1179
     /*
      * bpfwrite - send one packet supplied by userland out of the attached
      * interface.
      *
      * bpf_movein() turns the uio into an mbuf (plus 'dst' and the header
      * length 'hlen'); the mbuf is then handed to the interface's if_output
      * routine.  With bd_feedback set, a duplicate of the packet is also
      * passed to if_input when the send succeeded.
      *
      * Returns 0 on success (a zero-length write also returns 0), ENXIO if
      * no interface is attached or it went away during bpf_movein(),
      * ENETDOWN if the interface is not IFF_UP, or the error reported by
      * devfs_get_cdevpriv(), bpf_movein() or if_output.
      *
      * Counters: bd_wcount is bumped for every call that reaches the
      * descriptor, bd_wfcount for packets that passed bpf_movein() and the
      * re-attachment check, bd_wdcount for every early exit or failure.
      */
1180 static int
1181 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
1182 {
1183         struct route ro;
1184         struct sockaddr dst;
1185         struct epoch_tracker et;
1186         struct bpf_if *bp;
1187         struct bpf_d *d;
1188         struct ifnet *ifp;
1189         struct mbuf *m, *mc;
1190         int error, hlen;
1191
1192         error = devfs_get_cdevpriv((void **)&d);
1193         if (error != 0)
1194                 return (error);
1195
1196         NET_EPOCH_ENTER(et);
1197         BPFD_LOCK(d);
1198         BPF_PID_REFRESH_CUR(d);
1199         counter_u64_add(d->bd_wcount, 1);
1200         if ((bp = d->bd_bif) == NULL) {
1201                 error = ENXIO;
1202                 goto out_locked;
1203         }
1204
1205         ifp = bp->bif_ifp;
1206         if ((ifp->if_flags & IFF_UP) == 0) {
1207                 error = ENETDOWN;
1208                 goto out_locked;
1209         }
1210
             /*
              * Nothing to send.  error is still 0 here, yet out_locked bumps
              * bd_wdcount, so an empty write is counted as a drop.
              */
1211         if (uio->uio_resid == 0)
1212                 goto out_locked;
1213
1214         bzero(&dst, sizeof(dst));
1215         m = NULL;
1216         hlen = 0;
1217
1218         /*
1219          * Take extra reference, unlock d and exit from epoch section,
1220          * since bpf_movein() can sleep.
1221          */
1222         bpfd_ref(d);
1223         NET_EPOCH_EXIT(et);
1224         BPFD_UNLOCK(d);
1225
1226         error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
1227             &m, &dst, &hlen, d);
1228
             /*
              * Lock and epoch were already released above, so on failure only
              * the extra reference has to be dropped.
              */
1229         if (error != 0) {
1230                 counter_u64_add(d->bd_wdcount, 1);
1231                 bpfd_rele(d);
1232                 return (error);
1233         }
1234
1235         BPFD_LOCK(d);
1236         /*
1237          * Check that descriptor is still attached to the interface.
1238          * This can happen on bpfdetach(). To avoid access to detached
1239          * ifnet, free mbuf and return ENXIO.
1240          */
1241         if (d->bd_bif == NULL) {
1242                 counter_u64_add(d->bd_wdcount, 1);
1243                 BPFD_UNLOCK(d);
1244                 bpfd_rele(d);
1245                 m_freem(m);
1246                 return (ENXIO);
1247         }
1248         counter_u64_add(d->bd_wfcount, 1);
1249         if (d->bd_hdrcmplt)
1250                 dst.sa_family = pseudo_AF_HDRCMPLT;
1251
             /*
              * Feedback mode: duplicate the packet for later delivery to
              * if_input.  m_dup() is called with M_NOWAIT and may fail, in
              * which case the feedback copy is skipped (mc stays NULL).
              */
1252         if (d->bd_feedback) {
1253                 mc = m_dup(m, M_NOWAIT);
1254                 if (mc != NULL)
1255                         mc->m_pkthdr.rcvif = ifp;
1256                 /* Set M_PROMISC for outgoing packets to be discarded. */
1257                 if (d->bd_direction == BPF_D_INOUT)
1258                         m->m_flags |= M_PROMISC;
1259         } else
1260                 mc = NULL;
1261
             /*
              * Skip the first hlen bytes of the mbuf; when hlen is non-zero
              * they are passed to if_output through ro.ro_prepend below.
              * NOTE(review): presumably bpf_movein() stored that header in
              * dst.sa_data -- confirm.
              */
1262         m->m_pkthdr.len -= hlen;
1263         m->m_len -= hlen;
1264         m->m_data += hlen;      /* XXX */
1265
1266         CURVNET_SET(ifp->if_vnet);
1267 #ifdef MAC
1268         mac_bpfdesc_create_mbuf(d, m);
1269         if (mc != NULL)
1270                 mac_bpfdesc_create_mbuf(d, mc);
1271 #endif
1272
1273         bzero(&ro, sizeof(ro));
1274         if (hlen != 0) {
1275                 ro.ro_prepend = (u_char *)&dst.sa_data;
1276                 ro.ro_plen = hlen;
1277                 ro.ro_flags = RT_HAS_HEADER;
1278         }
1279
1280         if (d->bd_pcp != 0)
1281                 vlan_set_pcp(m, d->bd_pcp);
1282
1283         /* Avoid possible recursion on BPFD_LOCK(). */
1284         NET_EPOCH_ENTER(et);
1285         BPFD_UNLOCK(d);
1286         error = (*ifp->if_output)(ifp, m, &dst, &ro);
1287         if (error)
1288                 counter_u64_add(d->bd_wdcount, 1);
1289
             /*
              * Deliver the feedback copy only if the original was sent;
              * otherwise free it.
              */
1290         if (mc != NULL) {
1291                 if (error == 0)
1292                         (*ifp->if_input)(ifp, mc);
1293                 else
1294                         m_freem(mc);
1295         }
1296         NET_EPOCH_EXIT(et);
1297         CURVNET_RESTORE();
1298         bpfd_rele(d);
1299         return (error);
1300
             /*
              * Early exit taken with the descriptor lock and the net epoch
              * still held, and before the extra reference was acquired.
              */
1301 out_locked:
1302         counter_u64_add(d->bd_wdcount, 1);
1303         NET_EPOCH_EXIT(et);
1304         BPFD_UNLOCK(d);
1305         return (error);
1306 }
1307
1308 /*
1309  * Reset a descriptor by flushing its packet buffer and clearing the receive
1310  * and drop counts.  This is doable for kernel-only buffers, but with
1311  * zero-copy buffers, we can't write to (or rotate) buffers that are
1312  * currently owned by userspace.  It would be nice if we could encapsulate
1313  * this logic in the buffer code rather than here.
1314  */
1315 static void
1316 reset_d(struct bpf_d *d)
1317 {
1318
1319         BPFD_LOCK_ASSERT(d);
1320
1321         while (d->bd_hbuf_in_use)
1322                 mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
1323                     "bd_hbuf", 0);
1324         if ((d->bd_hbuf != NULL) &&
1325             (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1326                 /* Free the hold buffer. */
1327                 d->bd_fbuf = d->bd_hbuf;
1328                 d->bd_hbuf = NULL;
1329                 d->bd_hlen = 0;
1330                 bpf_buf_reclaimed(d);
1331         }
1332         if (bpf_canwritebuf(d))
1333                 d->bd_slen = 0;
1334         counter_u64_zero(d->bd_rcount);
1335         counter_u64_zero(d->bd_dcount);
1336         counter_u64_zero(d->bd_fcount);
1337         counter_u64_zero(d->bd_wcount);
1338         counter_u64_zero(d->bd_wfcount);
1339         counter_u64_zero(d->bd_wdcount);
1340         counter_u64_zero(d->bd_zcopy);
1341 }
1342
1343 /*
1344  *  FIONREAD            Check for read packet available.
1345  *  BIOCGBLEN           Get buffer len [for read()].
1346  *  BIOCSETF            Set read filter.
1347  *  BIOCSETFNR          Set read filter without resetting descriptor.
1348  *  BIOCSETWF           Set write filter.
1349  *  BIOCFLUSH           Flush read packet buffer.
1350  *  BIOCPROMISC         Put interface into promiscuous mode.
1351  *  BIOCGDLT            Get link layer type.
1352  *  BIOCGETIF           Get interface name.
1353  *  BIOCSETIF           Set interface.
1354  *  BIOCSRTIMEOUT       Set read timeout.
1355  *  BIOCGRTIMEOUT       Get read timeout.
1356  *  BIOCGSTATS          Get packet stats.
1357  *  BIOCIMMEDIATE       Set immediate mode.
1358  *  BIOCVERSION         Get filter language version.
1359  *  BIOCGHDRCMPLT       Get "header already complete" flag
1360  *  BIOCSHDRCMPLT       Set "header already complete" flag
1361  *  BIOCGDIRECTION      Get packet direction flag
1362  *  BIOCSDIRECTION      Set packet direction flag
1363  *  BIOCGTSTAMP         Get time stamp format and resolution.
1364  *  BIOCSTSTAMP         Set time stamp format and resolution.
1365  *  BIOCLOCK            Set "locked" flag
1366  *  BIOCFEEDBACK        Set packet feedback mode.
1367  *  BIOCSETZBUF         Set current zero-copy buffer locations.
1368  *  BIOCGETZMAX         Get maximum zero-copy buffer size.
1369  *  BIOCROTZBUF         Force rotation of zero-copy buffer
1370  *  BIOCSETBUFMODE      Set buffer mode.
1371  *  BIOCGETBUFMODE      Get current buffer mode.
1372  *  BIOCSETVLANPCP      Set VLAN PCP tag.
1373  */
1374 /* ARGSUSED */
1375 static  int
1376 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1377     struct thread *td)
1378 {
1379         struct bpf_d *d;
1380         int error;
1381
1382         error = devfs_get_cdevpriv((void **)&d);
1383         if (error != 0)
1384                 return (error);
1385
1386         /*
1387          * Refresh PID associated with this descriptor.
1388          */
1389         BPFD_LOCK(d);
1390         BPF_PID_REFRESH(d, td);
1391         if (d->bd_state == BPF_WAITING)
1392                 callout_stop(&d->bd_callout);
1393         d->bd_state = BPF_IDLE;
1394         BPFD_UNLOCK(d);
1395
1396         if (d->bd_locked == 1) {
1397                 switch (cmd) {
1398                 case BIOCGBLEN:
1399                 case BIOCFLUSH:
1400                 case BIOCGDLT:
1401                 case BIOCGDLTLIST:
1402 #ifdef COMPAT_FREEBSD32
1403                 case BIOCGDLTLIST32:
1404 #endif
1405                 case BIOCGETIF:
1406                 case BIOCGRTIMEOUT:
1407 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1408                 case BIOCGRTIMEOUT32:
1409 #endif
1410                 case BIOCGSTATS:
1411                 case BIOCVERSION:
1412                 case BIOCGRSIG:
1413                 case BIOCGHDRCMPLT:
1414                 case BIOCSTSTAMP:
1415                 case BIOCFEEDBACK:
1416                 case FIONREAD:
1417                 case BIOCLOCK:
1418                 case BIOCSRTIMEOUT:
1419 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1420                 case BIOCSRTIMEOUT32:
1421 #endif
1422                 case BIOCIMMEDIATE:
1423                 case TIOCGPGRP:
1424                 case BIOCROTZBUF:
1425                         break;
1426                 default:
1427                         return (EPERM);
1428                 }
1429         }
1430 #ifdef COMPAT_FREEBSD32
1431         /*
1432          * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1433          * that it will get 32-bit packet headers.
1434          */
1435         switch (cmd) {
1436         case BIOCSETF32:
1437         case BIOCSETFNR32:
1438         case BIOCSETWF32:
1439         case BIOCGDLTLIST32:
1440         case BIOCGRTIMEOUT32:
1441         case BIOCSRTIMEOUT32:
1442                 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1443                         BPFD_LOCK(d);
1444                         d->bd_compat32 = 1;
1445                         BPFD_UNLOCK(d);
1446                 }
1447         }
1448 #endif
1449
1450         CURVNET_SET(TD_TO_VNET(td));
1451         switch (cmd) {
1452         default:
1453                 error = EINVAL;
1454                 break;
1455
1456         /*
1457          * Check for read packet available.
1458          */
1459         case FIONREAD:
1460                 {
1461                         int n;
1462
1463                         BPFD_LOCK(d);
1464                         n = d->bd_slen;
1465                         while (d->bd_hbuf_in_use)
1466                                 mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1467                                     PRINET, "bd_hbuf", 0);
1468                         if (d->bd_hbuf)
1469                                 n += d->bd_hlen;
1470                         BPFD_UNLOCK(d);
1471
1472                         *(int *)addr = n;
1473                         break;
1474                 }
1475
1476         /*
1477          * Get buffer len [for read()].
1478          */
1479         case BIOCGBLEN:
1480                 BPFD_LOCK(d);
1481                 *(u_int *)addr = d->bd_bufsize;
1482                 BPFD_UNLOCK(d);
1483                 break;
1484
1485         /*
1486          * Set buffer length.
1487          */
1488         case BIOCSBLEN:
1489                 error = bpf_ioctl_sblen(d, (u_int *)addr);
1490                 break;
1491
1492         /*
1493          * Set link layer read filter.
1494          */
1495         case BIOCSETF:
1496         case BIOCSETFNR:
1497         case BIOCSETWF:
1498 #ifdef COMPAT_FREEBSD32
1499         case BIOCSETF32:
1500         case BIOCSETFNR32:
1501         case BIOCSETWF32:
1502 #endif
1503                 error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1504                 break;
1505
1506         /*
1507          * Flush read packet buffer.
1508          */
1509         case BIOCFLUSH:
1510                 BPFD_LOCK(d);
1511                 reset_d(d);
1512                 BPFD_UNLOCK(d);
1513                 break;
1514
1515         /*
1516          * Put interface into promiscuous mode.
1517          */
1518         case BIOCPROMISC:
1519                 BPF_LOCK();
1520                 if (d->bd_bif == NULL) {
1521                         /*
1522                          * No interface attached yet.
1523                          */
1524                         error = EINVAL;
1525                 } else if (d->bd_promisc == 0) {
1526                         error = ifpromisc(d->bd_bif->bif_ifp, 1);
1527                         if (error == 0)
1528                                 d->bd_promisc = 1;
1529                 }
1530                 BPF_UNLOCK();
1531                 break;
1532
1533         /*
1534          * Get current data link type.
1535          */
1536         case BIOCGDLT:
1537                 BPF_LOCK();
1538                 if (d->bd_bif == NULL)
1539                         error = EINVAL;
1540                 else
1541                         *(u_int *)addr = d->bd_bif->bif_dlt;
1542                 BPF_UNLOCK();
1543                 break;
1544
1545         /*
1546          * Get a list of supported data link types.
1547          */
1548 #ifdef COMPAT_FREEBSD32
1549         case BIOCGDLTLIST32:
1550                 {
1551                         struct bpf_dltlist32 *list32;
1552                         struct bpf_dltlist dltlist;
1553
1554                         list32 = (struct bpf_dltlist32 *)addr;
1555                         dltlist.bfl_len = list32->bfl_len;
1556                         dltlist.bfl_list = PTRIN(list32->bfl_list);
1557                         BPF_LOCK();
1558                         if (d->bd_bif == NULL)
1559                                 error = EINVAL;
1560                         else {
1561                                 error = bpf_getdltlist(d, &dltlist);
1562                                 if (error == 0)
1563                                         list32->bfl_len = dltlist.bfl_len;
1564                         }
1565                         BPF_UNLOCK();
1566                         break;
1567                 }
1568 #endif
1569
1570         case BIOCGDLTLIST:
1571                 BPF_LOCK();
1572                 if (d->bd_bif == NULL)
1573                         error = EINVAL;
1574                 else
1575                         error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1576                 BPF_UNLOCK();
1577                 break;
1578
1579         /*
1580          * Set data link type.
1581          */
1582         case BIOCSDLT:
1583                 BPF_LOCK();
1584                 if (d->bd_bif == NULL)
1585                         error = EINVAL;
1586                 else
1587                         error = bpf_setdlt(d, *(u_int *)addr);
1588                 BPF_UNLOCK();
1589                 break;
1590
1591         /*
1592          * Get interface name.
1593          */
1594         case BIOCGETIF:
1595                 BPF_LOCK();
1596                 if (d->bd_bif == NULL)
1597                         error = EINVAL;
1598                 else {
1599                         struct ifnet *const ifp = d->bd_bif->bif_ifp;
1600                         struct ifreq *const ifr = (struct ifreq *)addr;
1601
1602                         strlcpy(ifr->ifr_name, ifp->if_xname,
1603                             sizeof(ifr->ifr_name));
1604                 }
1605                 BPF_UNLOCK();
1606                 break;
1607
1608         /*
1609          * Set interface.
1610          */
1611         case BIOCSETIF:
1612                 {
1613                         int alloc_buf, size;
1614
1615                         /*
1616                          * Behavior here depends on the buffering model.  If
1617                          * we're using kernel memory buffers, then we can
1618                          * allocate them here.  If we're using zero-copy,
1619                          * then the user process must have registered buffers
1620                          * by the time we get here.
1621                          */
1622                         alloc_buf = 0;
1623                         BPFD_LOCK(d);
1624                         if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1625                             d->bd_sbuf == NULL)
1626                                 alloc_buf = 1;
1627                         BPFD_UNLOCK(d);
1628                         if (alloc_buf) {
1629                                 size = d->bd_bufsize;
1630                                 error = bpf_buffer_ioctl_sblen(d, &size);
1631                                 if (error != 0)
1632                                         break;
1633                         }
1634                         BPF_LOCK();
1635                         error = bpf_setif(d, (struct ifreq *)addr);
1636                         BPF_UNLOCK();
1637                         break;
1638                 }
1639
1640         /*
1641          * Set read timeout.
1642          */
1643         case BIOCSRTIMEOUT:
1644 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1645         case BIOCSRTIMEOUT32:
1646 #endif
1647                 {
1648                         struct timeval *tv = (struct timeval *)addr;
1649 #if defined(COMPAT_FREEBSD32)
1650                         struct timeval32 *tv32;
1651                         struct timeval tv64;
1652
1653                         if (cmd == BIOCSRTIMEOUT32) {
1654                                 tv32 = (struct timeval32 *)addr;
1655                                 tv = &tv64;
1656                                 tv->tv_sec = tv32->tv_sec;
1657                                 tv->tv_usec = tv32->tv_usec;
1658                         } else
1659 #endif
1660                                 tv = (struct timeval *)addr;
1661
1662                         /*
1663                          * Subtract 1 tick from tvtohz() since this isn't
1664                          * a one-shot timer.
1665                          */
1666                         if ((error = itimerfix(tv)) == 0)
1667                                 d->bd_rtout = tvtohz(tv) - 1;
1668                         break;
1669                 }
1670
1671         /*
1672          * Get read timeout.
1673          */
1674         case BIOCGRTIMEOUT:
1675 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1676         case BIOCGRTIMEOUT32:
1677 #endif
1678                 {
1679                         struct timeval *tv;
1680 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1681                         struct timeval32 *tv32;
1682                         struct timeval tv64;
1683
1684                         if (cmd == BIOCGRTIMEOUT32)
1685                                 tv = &tv64;
1686                         else
1687 #endif
1688                                 tv = (struct timeval *)addr;
1689
1690                         tv->tv_sec = d->bd_rtout / hz;
1691                         tv->tv_usec = (d->bd_rtout % hz) * tick;
1692 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1693                         if (cmd == BIOCGRTIMEOUT32) {
1694                                 tv32 = (struct timeval32 *)addr;
1695                                 tv32->tv_sec = tv->tv_sec;
1696                                 tv32->tv_usec = tv->tv_usec;
1697                         }
1698 #endif
1699
1700                         break;
1701                 }
1702
1703         /*
1704          * Get packet stats.
1705          */
1706         case BIOCGSTATS:
1707                 {
1708                         struct bpf_stat *bs = (struct bpf_stat *)addr;
1709
1710                         /* XXXCSJP overflow */
1711                         bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
1712                         bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
1713                         break;
1714                 }
1715
1716         /*
1717          * Set immediate mode.
1718          */
1719         case BIOCIMMEDIATE:
1720                 BPFD_LOCK(d);
1721                 d->bd_immediate = *(u_int *)addr;
1722                 BPFD_UNLOCK(d);
1723                 break;
1724
1725         case BIOCVERSION:
1726                 {
1727                         struct bpf_version *bv = (struct bpf_version *)addr;
1728
1729                         bv->bv_major = BPF_MAJOR_VERSION;
1730                         bv->bv_minor = BPF_MINOR_VERSION;
1731                         break;
1732                 }
1733
1734         /*
1735          * Get "header already complete" flag
1736          */
1737         case BIOCGHDRCMPLT:
1738                 BPFD_LOCK(d);
1739                 *(u_int *)addr = d->bd_hdrcmplt;
1740                 BPFD_UNLOCK(d);
1741                 break;
1742
1743         /*
1744          * Set "header already complete" flag
1745          */
1746         case BIOCSHDRCMPLT:
1747                 BPFD_LOCK(d);
1748                 d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1749                 BPFD_UNLOCK(d);
1750                 break;
1751
1752         /*
1753          * Get packet direction flag
1754          */
1755         case BIOCGDIRECTION:
1756                 BPFD_LOCK(d);
1757                 *(u_int *)addr = d->bd_direction;
1758                 BPFD_UNLOCK(d);
1759                 break;
1760
1761         /*
1762          * Set packet direction flag
1763          */
1764         case BIOCSDIRECTION:
1765                 {
1766                         u_int   direction;
1767
1768                         direction = *(u_int *)addr;
1769                         switch (direction) {
1770                         case BPF_D_IN:
1771                         case BPF_D_INOUT:
1772                         case BPF_D_OUT:
1773                                 BPFD_LOCK(d);
1774                                 d->bd_direction = direction;
1775                                 BPFD_UNLOCK(d);
1776                                 break;
1777                         default:
1778                                 error = EINVAL;
1779                         }
1780                 }
1781                 break;
1782
1783         /*
1784          * Get packet timestamp format and resolution.
1785          */
1786         case BIOCGTSTAMP:
1787                 BPFD_LOCK(d);
1788                 *(u_int *)addr = d->bd_tstamp;
1789                 BPFD_UNLOCK(d);
1790                 break;
1791
1792         /*
1793          * Set packet timestamp format and resolution.
1794          */
1795         case BIOCSTSTAMP:
1796                 {
1797                         u_int   func;
1798
1799                         func = *(u_int *)addr;
1800                         if (BPF_T_VALID(func))
1801                                 d->bd_tstamp = func;
1802                         else
1803                                 error = EINVAL;
1804                 }
1805                 break;
1806
1807         case BIOCFEEDBACK:
1808                 BPFD_LOCK(d);
1809                 d->bd_feedback = *(u_int *)addr;
1810                 BPFD_UNLOCK(d);
1811                 break;
1812
1813         case BIOCLOCK:
1814                 BPFD_LOCK(d);
1815                 d->bd_locked = 1;
1816                 BPFD_UNLOCK(d);
1817                 break;
1818
1819         case FIONBIO:           /* Non-blocking I/O */
1820                 break;
1821
1822         case FIOASYNC:          /* Send signal on receive packets */
1823                 BPFD_LOCK(d);
1824                 d->bd_async = *(int *)addr;
1825                 BPFD_UNLOCK(d);
1826                 break;
1827
1828         case FIOSETOWN:
1829                 /*
1830                  * XXX: Add some sort of locking here?
1831                  * fsetown() can sleep.
1832                  */
1833                 error = fsetown(*(int *)addr, &d->bd_sigio);
1834                 break;
1835
1836         case FIOGETOWN:
1837                 BPFD_LOCK(d);
1838                 *(int *)addr = fgetown(&d->bd_sigio);
1839                 BPFD_UNLOCK(d);
1840                 break;
1841
1842         /* This is deprecated, FIOSETOWN should be used instead. */
1843         case TIOCSPGRP:
1844                 error = fsetown(-(*(int *)addr), &d->bd_sigio);
1845                 break;
1846
1847         /* This is deprecated, FIOGETOWN should be used instead. */
1848         case TIOCGPGRP:
1849                 *(int *)addr = -fgetown(&d->bd_sigio);
1850                 break;
1851
1852         case BIOCSRSIG:         /* Set receive signal */
1853                 {
1854                         u_int sig;
1855
1856                         sig = *(u_int *)addr;
1857
1858                         if (sig >= NSIG)
1859                                 error = EINVAL;
1860                         else {
1861                                 BPFD_LOCK(d);
1862                                 d->bd_sig = sig;
1863                                 BPFD_UNLOCK(d);
1864                         }
1865                         break;
1866                 }
1867         case BIOCGRSIG:
1868                 BPFD_LOCK(d);
1869                 *(u_int *)addr = d->bd_sig;
1870                 BPFD_UNLOCK(d);
1871                 break;
1872
1873         case BIOCGETBUFMODE:
1874                 BPFD_LOCK(d);
1875                 *(u_int *)addr = d->bd_bufmode;
1876                 BPFD_UNLOCK(d);
1877                 break;
1878
1879         case BIOCSETBUFMODE:
1880                 /*
1881                  * Allow the buffering mode to be changed as long as we
1882                  * haven't yet committed to a particular mode.  Our
1883                  * definition of commitment, for now, is whether or not a
1884                  * buffer has been allocated or an interface attached, since
1885                  * that's the point where things get tricky.
1886                  */
1887                 switch (*(u_int *)addr) {
1888                 case BPF_BUFMODE_BUFFER:
1889                         break;
1890
1891                 case BPF_BUFMODE_ZBUF:
1892                         if (bpf_zerocopy_enable)
1893                                 break;
1894                         /* FALLSTHROUGH */
1895
1896                 default:
1897                         CURVNET_RESTORE();
1898                         return (EINVAL);
1899                 }
1900
1901                 BPFD_LOCK(d);
1902                 if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1903                     d->bd_fbuf != NULL || d->bd_bif != NULL) {
1904                         BPFD_UNLOCK(d);
1905                         CURVNET_RESTORE();
1906                         return (EBUSY);
1907                 }
1908                 d->bd_bufmode = *(u_int *)addr;
1909                 BPFD_UNLOCK(d);
1910                 break;
1911
1912         case BIOCGETZMAX:
1913                 error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1914                 break;
1915
1916         case BIOCSETZBUF:
1917                 error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1918                 break;
1919
1920         case BIOCROTZBUF:
1921                 error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1922                 break;
1923
1924         case BIOCSETVLANPCP:
1925                 {
1926                         u_int pcp;
1927
1928                         pcp = *(u_int *)addr;
1929                         if (pcp > BPF_PRIO_MAX || pcp < 0) {
1930                                 error = EINVAL;
1931                                 break;
1932                         }
1933                         d->bd_pcp = pcp;
1934                         break;
1935                 }
1936         }
1937         CURVNET_RESTORE();
1938         return (error);
1939 }
1940
1941 /*
1942  * Set d's packet filter program to fp. If this file already has a filter,
1943  * free it and replace it. Returns EINVAL for bogus requests.
1944  *
1945  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
1946  * calls.
1947  */
1948 static int
1949 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1950 {
1951 #ifdef COMPAT_FREEBSD32
1952         struct bpf_program fp_swab;
1953         struct bpf_program32 *fp32;
1954 #endif
1955         struct bpf_program_buffer *fcode;
1956         struct bpf_insn *filter;
1957 #ifdef BPF_JITTER
1958         bpf_jit_filter *jfunc;
1959 #endif
1960         size_t size;
1961         u_int flen;
1962         bool track_event;
1963
1964 #ifdef COMPAT_FREEBSD32
1965         switch (cmd) {
1966         case BIOCSETF32:
1967         case BIOCSETWF32:
1968         case BIOCSETFNR32:
1969                 fp32 = (struct bpf_program32 *)fp;
1970                 fp_swab.bf_len = fp32->bf_len;
1971                 fp_swab.bf_insns =
1972                     (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1973                 fp = &fp_swab;
1974                 switch (cmd) {
1975                 case BIOCSETF32:
1976                         cmd = BIOCSETF;
1977                         break;
1978                 case BIOCSETWF32:
1979                         cmd = BIOCSETWF;
1980                         break;
1981                 }
1982                 break;
1983         }
1984 #endif
1985
1986         filter = NULL;
1987 #ifdef BPF_JITTER
1988         jfunc = NULL;
1989 #endif
1990         /*
1991          * Check new filter validness before acquiring any locks.
1992          * Allocate memory for new filter, if needed.
1993          */
1994         flen = fp->bf_len;
1995         if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1996                 return (EINVAL);
1997         size = flen * sizeof(*fp->bf_insns);
1998         if (size > 0) {
1999                 /* We're setting up new filter. Copy and check actual data. */
2000                 fcode = bpf_program_buffer_alloc(size, M_WAITOK);
2001                 filter = (struct bpf_insn *)fcode->buffer;
2002                 if (copyin(fp->bf_insns, filter, size) != 0 ||
2003                     !bpf_validate(filter, flen)) {
2004                         free(fcode, M_BPF);
2005                         return (EINVAL);
2006                 }
2007 #ifdef BPF_JITTER
2008                 if (cmd != BIOCSETWF) {
2009                         /*
2010                          * Filter is copied inside fcode and is
2011                          * perfectly valid.
2012                          */
2013                         jfunc = bpf_jitter(filter, flen);
2014                 }
2015 #endif
2016         }
2017
2018         track_event = false;
2019         fcode = NULL;
2020
2021         BPF_LOCK();
2022         BPFD_LOCK(d);
2023         /* Set up new filter. */
2024         if (cmd == BIOCSETWF) {
2025                 if (d->bd_wfilter != NULL) {
2026                         fcode = __containerof((void *)d->bd_wfilter,
2027                             struct bpf_program_buffer, buffer);
2028 #ifdef BPF_JITTER
2029                         fcode->func = NULL;
2030 #endif
2031                 }
2032                 d->bd_wfilter = filter;
2033         } else {
2034                 if (d->bd_rfilter != NULL) {
2035                         fcode = __containerof((void *)d->bd_rfilter,
2036                             struct bpf_program_buffer, buffer);
2037 #ifdef BPF_JITTER
2038                         fcode->func = d->bd_bfilter;
2039 #endif
2040                 }
2041                 d->bd_rfilter = filter;
2042 #ifdef BPF_JITTER
2043                 d->bd_bfilter = jfunc;
2044 #endif
2045                 if (cmd == BIOCSETF)
2046                         reset_d(d);
2047
2048                 if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
2049                         /*
2050                          * Filter can be set several times without
2051                          * specifying interface. In this case just mark d
2052                          * as reader.
2053                          */
2054                         d->bd_writer = 0;
2055                         if (d->bd_bif != NULL) {
2056                                 /*
2057                                  * Remove descriptor from writers-only list
2058                                  * and add it to active readers list.
2059                                  */
2060                                 CK_LIST_REMOVE(d, bd_next);
2061                                 CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
2062                                     d, bd_next);
2063                                 CTR2(KTR_NET,
2064                                     "%s: upgrade required by pid %d",
2065                                     __func__, d->bd_pid);
2066                                 track_event = true;
2067                         }
2068                 }
2069         }
2070         BPFD_UNLOCK(d);
2071
2072         if (fcode != NULL)
2073                 NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);
2074
2075         if (track_event)
2076                 EVENTHANDLER_INVOKE(bpf_track,
2077                     d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
2078
2079         BPF_UNLOCK();
2080         return (0);
2081 }
2082
2083 /*
2084  * Detach a file from its current interface (if attached at all) and attach
2085  * to the interface indicated by the name stored in ifr.
2086  * Return an errno or 0.
2087  */
2088 static int
2089 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
2090 {
2091         struct bpf_if *bp;
2092         struct ifnet *theywant;
2093
2094         BPF_LOCK_ASSERT();
2095
2096         theywant = ifunit(ifr->ifr_name);
2097         if (theywant == NULL || theywant->if_bpf == NULL)
2098                 return (ENXIO);
2099
2100         bp = theywant->if_bpf;
2101         /*
2102          * At this point, we expect the buffer is already allocated.  If not,
2103          * return an error.
2104          */
2105         switch (d->bd_bufmode) {
2106         case BPF_BUFMODE_BUFFER:
2107         case BPF_BUFMODE_ZBUF:
2108                 if (d->bd_sbuf == NULL)
2109                         return (EINVAL);
2110                 break;
2111
2112         default:
2113                 panic("bpf_setif: bufmode %d", d->bd_bufmode);
2114         }
2115         if (bp != d->bd_bif)
2116                 bpf_attachd(d, bp);
2117         else {
2118                 BPFD_LOCK(d);
2119                 reset_d(d);
2120                 BPFD_UNLOCK(d);
2121         }
2122         return (0);
2123 }
2124
2125 /*
2126  * Support for select() and poll() system calls
2127  *
2128  * Return true iff the specific operation will not block indefinitely.
2129  * Otherwise, return false but make a note that a selwakeup() must be done.
2130  */
2131 static int
2132 bpfpoll(struct cdev *dev, int events, struct thread *td)
2133 {
2134         struct bpf_d *d;
2135         int revents;
2136
2137         if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
2138                 return (events &
2139                     (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
2140
2141         /*
2142          * Refresh PID associated with this descriptor.
2143          */
2144         revents = events & (POLLOUT | POLLWRNORM);
2145         BPFD_LOCK(d);
2146         BPF_PID_REFRESH(d, td);
2147         if (events & (POLLIN | POLLRDNORM)) {
2148                 if (bpf_ready(d))
2149                         revents |= events & (POLLIN | POLLRDNORM);
2150                 else {
2151                         selrecord(td, &d->bd_sel);
2152                         /* Start the read timeout if necessary. */
2153                         if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2154                                 callout_reset(&d->bd_callout, d->bd_rtout,
2155                                     bpf_timed_out, d);
2156                                 d->bd_state = BPF_WAITING;
2157                         }
2158                 }
2159         }
2160         BPFD_UNLOCK(d);
2161         return (revents);
2162 }
2163
2164 /*
2165  * Support for kevent() system call.  Register EVFILT_READ filters and
2166  * reject all others.
2167  */
2168 int
2169 bpfkqfilter(struct cdev *dev, struct knote *kn)
2170 {
2171         struct bpf_d *d;
2172
2173         if (devfs_get_cdevpriv((void **)&d) != 0)
2174                 return (1);
2175
2176         switch (kn->kn_filter) {
2177         case EVFILT_READ:
2178                 kn->kn_fop = &bpfread_filtops;
2179                 break;
2180
2181         case EVFILT_WRITE:
2182                 kn->kn_fop = &bpfwrite_filtops;
2183                 break;
2184
2185         default:
2186                 return (1);
2187         }
2188
2189         /*
2190          * Refresh PID associated with this descriptor.
2191          */
2192         BPFD_LOCK(d);
2193         BPF_PID_REFRESH_CUR(d);
2194         kn->kn_hook = d;
2195         knlist_add(&d->bd_sel.si_note, kn, 1);
2196         BPFD_UNLOCK(d);
2197
2198         return (0);
2199 }
2200
2201 static void
2202 filt_bpfdetach(struct knote *kn)
2203 {
2204         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2205
2206         knlist_remove(&d->bd_sel.si_note, kn, 0);
2207 }
2208
2209 static int
2210 filt_bpfread(struct knote *kn, long hint)
2211 {
2212         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2213         int ready;
2214
2215         BPFD_LOCK_ASSERT(d);
2216         ready = bpf_ready(d);
2217         if (ready) {
2218                 kn->kn_data = d->bd_slen;
2219                 /*
2220                  * Ignore the hold buffer if it is being copied to user space.
2221                  */
2222                 if (!d->bd_hbuf_in_use && d->bd_hbuf)
2223                         kn->kn_data += d->bd_hlen;
2224         } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2225                 callout_reset(&d->bd_callout, d->bd_rtout,
2226                     bpf_timed_out, d);
2227                 d->bd_state = BPF_WAITING;
2228         }
2229
2230         return (ready);
2231 }
2232
2233 static int
2234 filt_bpfwrite(struct knote *kn, long hint)
2235 {
2236         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2237
2238         BPFD_LOCK_ASSERT(d);
2239
2240         if (d->bd_bif == NULL) {
2241                 kn->kn_data = 0;
2242                 return (0);
2243         } else {
2244                 kn->kn_data = d->bd_bif->bif_ifp->if_mtu;
2245                 return (1);
2246         }
2247 }
2248
/*
 * Timestamp quality levels returned by bpf_ts_quality() and bpf_gettime().
 * The numeric order matters: the tap routines compare these values with
 * '<' to decide whether the timestamp already obtained for a packet is
 * good enough for the next descriptor.
 */
#define BPF_TSTAMP_NONE         0
#define BPF_TSTAMP_FAST         1
#define BPF_TSTAMP_NORMAL       2
#define BPF_TSTAMP_EXTERN       3
2253
2254 static int
2255 bpf_ts_quality(int tstype)
2256 {
2257
2258         if (tstype == BPF_T_NONE)
2259                 return (BPF_TSTAMP_NONE);
2260         if ((tstype & BPF_T_FAST) != 0)
2261                 return (BPF_TSTAMP_FAST);
2262
2263         return (BPF_TSTAMP_NORMAL);
2264 }
2265
2266 static int
2267 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2268 {
2269         struct timespec ts;
2270         struct m_tag *tag;
2271         int quality;
2272
2273         quality = bpf_ts_quality(tstype);
2274         if (quality == BPF_TSTAMP_NONE)
2275                 return (quality);
2276
2277         if (m != NULL) {
2278                 if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR | M_TSTMP)) {
2279                         mbuf_tstmp2timespec(m, &ts);
2280                         timespec2bintime(&ts, bt);
2281                         return (BPF_TSTAMP_EXTERN);
2282                 }
2283                 tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2284                 if (tag != NULL) {
2285                         *bt = *(struct bintime *)(tag + 1);
2286                         return (BPF_TSTAMP_EXTERN);
2287                 }
2288         }
2289         if (quality == BPF_TSTAMP_NORMAL)
2290                 binuptime(bt);
2291         else
2292                 getbinuptime(bt);
2293
2294         return (quality);
2295 }
2296
2297 /*
2298  * Incoming linkage from device drivers.  Process the packet pkt, of length
2299  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2300  * by each process' filter, and if accepted, stashed into the corresponding
2301  * buffer.
2302  */
2303 void
2304 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2305 {
2306         struct epoch_tracker et;
2307         struct bintime bt;
2308         struct bpf_d *d;
2309 #ifdef BPF_JITTER
2310         bpf_jit_filter *bf;
2311 #endif
2312         u_int slen;
2313         int gottime;
2314
2315         gottime = BPF_TSTAMP_NONE;
2316         NET_EPOCH_ENTER(et);
2317         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2318                 counter_u64_add(d->bd_rcount, 1);
2319                 /*
2320                  * NB: We dont call BPF_CHECK_DIRECTION() here since there
2321                  * is no way for the caller to indiciate to us whether this
2322                  * packet is inbound or outbound. In the bpf_mtap() routines,
2323                  * we use the interface pointers on the mbuf to figure it out.
2324                  */
2325 #ifdef BPF_JITTER
2326                 bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2327                 if (bf != NULL)
2328                         slen = (*(bf->func))(pkt, pktlen, pktlen);
2329                 else
2330 #endif
2331                 slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2332                 if (slen != 0) {
2333                         /*
2334                          * Filter matches. Let's to acquire write lock.
2335                          */
2336                         BPFD_LOCK(d);
2337                         counter_u64_add(d->bd_fcount, 1);
2338                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2339                                 gottime = bpf_gettime(&bt, d->bd_tstamp,
2340                                     NULL);
2341 #ifdef MAC
2342                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2343 #endif
2344                                 catchpacket(d, pkt, pktlen, slen,
2345                                     bpf_append_bytes, &bt);
2346                         BPFD_UNLOCK(d);
2347                 }
2348         }
2349         NET_EPOCH_EXIT(et);
2350 }
2351
2352 void
2353 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
2354 {
2355         if (bpf_peers_present(ifp->if_bpf))
2356                 bpf_tap(ifp->if_bpf, pkt, pktlen);
2357 }
2358
/*
 * Evaluates to true when descriptor d must NOT see the packet; the mbuf
 * tap routines skip the descriptor in that case.  r is the packet's
 * receive interface and i is the interface being tapped: a BPF_D_IN
 * descriptor skips packets whose receive interface is not i, and a
 * BPF_D_OUT descriptor skips packets whose receive interface is i.
 * Note that d, r and i may each be evaluated more than once.
 */
#define BPF_CHECK_DIRECTION(d, r, i)                            \
            (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||   \
            ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
2362
2363 /*
2364  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2365  * Locking model is explained in bpf_tap().
2366  */
2367 void
2368 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2369 {
2370         struct epoch_tracker et;
2371         struct bintime bt;
2372         struct bpf_d *d;
2373 #ifdef BPF_JITTER
2374         bpf_jit_filter *bf;
2375 #endif
2376         u_int pktlen, slen;
2377         int gottime;
2378
2379         /* Skip outgoing duplicate packets. */
2380         if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
2381                 m->m_flags &= ~M_PROMISC;
2382                 return;
2383         }
2384
2385         pktlen = m_length(m, NULL);
2386         gottime = BPF_TSTAMP_NONE;
2387
2388         NET_EPOCH_ENTER(et);
2389         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2390                 if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp))
2391                         continue;
2392                 counter_u64_add(d->bd_rcount, 1);
2393 #ifdef BPF_JITTER
2394                 bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2395                 /* XXX We cannot handle multiple mbufs. */
2396                 if (bf != NULL && m->m_next == NULL)
2397                         slen = (*(bf->func))(mtod(m, u_char *), pktlen,
2398                             pktlen);
2399                 else
2400 #endif
2401                 slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2402                 if (slen != 0) {
2403                         BPFD_LOCK(d);
2404
2405                         counter_u64_add(d->bd_fcount, 1);
2406                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2407                                 gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2408 #ifdef MAC
2409                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2410 #endif
2411                                 catchpacket(d, (u_char *)m, pktlen, slen,
2412                                     bpf_append_mbuf, &bt);
2413                         BPFD_UNLOCK(d);
2414                 }
2415         }
2416         NET_EPOCH_EXIT(et);
2417 }
2418
2419 void
2420 bpf_mtap_if(if_t ifp, struct mbuf *m)
2421 {
2422         if (bpf_peers_present(ifp->if_bpf)) {
2423                 M_ASSERTVALID(m);
2424                 bpf_mtap(ifp->if_bpf, m);
2425         }
2426 }
2427
2428 /*
2429  * Incoming linkage from device drivers, when packet is in
2430  * an mbuf chain and to be prepended by a contiguous header.
2431  */
2432 void
2433 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2434 {
2435         struct epoch_tracker et;
2436         struct bintime bt;
2437         struct mbuf mb;
2438         struct bpf_d *d;
2439         u_int pktlen, slen;
2440         int gottime;
2441
2442         /* Skip outgoing duplicate packets. */
2443         if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2444                 m->m_flags &= ~M_PROMISC;
2445                 return;
2446         }
2447
2448         pktlen = m_length(m, NULL);
2449         /*
2450          * Craft on-stack mbuf suitable for passing to bpf_filter.
2451          * Note that we cut corners here; we only setup what's
2452          * absolutely needed--this mbuf should never go anywhere else.
2453          */
2454         mb.m_flags = 0;
2455         mb.m_next = m;
2456         mb.m_data = data;
2457         mb.m_len = dlen;
2458         pktlen += dlen;
2459
2460         gottime = BPF_TSTAMP_NONE;
2461
2462         NET_EPOCH_ENTER(et);
2463         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2464                 if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2465                         continue;
2466                 counter_u64_add(d->bd_rcount, 1);
2467                 slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2468                 if (slen != 0) {
2469                         BPFD_LOCK(d);
2470
2471                         counter_u64_add(d->bd_fcount, 1);
2472                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2473                                 gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2474 #ifdef MAC
2475                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2476 #endif
2477                                 catchpacket(d, (u_char *)&mb, pktlen, slen,
2478                                     bpf_append_mbuf, &bt);
2479                         BPFD_UNLOCK(d);
2480                 }
2481         }
2482         NET_EPOCH_EXIT(et);
2483 }
2484
2485 void
2486 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
2487 {
2488         if (bpf_peers_present(ifp->if_bpf)) {
2489                 M_ASSERTVALID(m);
2490                 bpf_mtap2(ifp->if_bpf, data, dlen, m);
2491         }
2492 }
2493
/*
 * The direction check and the timestamp quality levels are helpers for the
 * timestamp and tap routines above only; drop them here so the names are
 * not visible to the rest of the file.
 */
#undef  BPF_CHECK_DIRECTION
#undef  BPF_TSTAMP_NONE
#undef  BPF_TSTAMP_FAST
#undef  BPF_TSTAMP_NORMAL
#undef  BPF_TSTAMP_EXTERN
2499
/*
 * Return the length of the per-packet BPF header, padding included, for
 * descriptor d.  catchpacket() uses this value both as the header length
 * stored in the record and as the space placed in front of the packet data.
 *
 * The padding is chosen so that the interface's bif_hdrlen plus the BPF
 * header ends on a word boundary: the two are summed, rounded up with
 * BPF_WORDALIGN() (BPF_WORDALIGN32() for 32-bit compat descriptors), and
 * bif_hdrlen is subtracted again.
 *
 * Unless BURN_BRIDGES is defined, descriptors with no timestamp or with
 * the microtime format use the old struct bpf_hdr (struct bpf_hdr32 for
 * 32-bit compat descriptors); all others use struct bpf_xhdr.
 *
 * d->bd_bif is dereferenced unconditionally, so d must be attached.
 */
static int
bpf_hdrlen(struct bpf_d *d)
{
        int hdrlen;

        hdrlen = d->bd_bif->bif_hdrlen;
#ifndef BURN_BRIDGES
        /*
         * NB: the if/else chains below are deliberately unbraced; each
         * preprocessor section contributes one arm, and the final
         * statement serves as the else branch of whatever was compiled in.
         */
        if (d->bd_tstamp == BPF_T_NONE ||
            BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
#ifdef COMPAT_FREEBSD32
                if (d->bd_compat32)
                        hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
                else
#endif
                        hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
        else
#endif
                hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
#ifdef COMPAT_FREEBSD32
        if (d->bd_compat32)
                hdrlen = BPF_WORDALIGN32(hdrlen);
        else
#endif
                hdrlen = BPF_WORDALIGN(hdrlen);

        /* Only the BPF header and its padding are reported. */
        return (hdrlen - d->bd_bif->bif_hdrlen);
}
2527
2528 static void
2529 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2530 {
2531         struct bintime bt2, boottimebin;
2532         struct timeval tsm;
2533         struct timespec tsn;
2534
2535         if ((tstype & BPF_T_MONOTONIC) == 0) {
2536                 bt2 = *bt;
2537                 getboottimebin(&boottimebin);
2538                 bintime_add(&bt2, &boottimebin);
2539                 bt = &bt2;
2540         }
2541         switch (BPF_T_FORMAT(tstype)) {
2542         case BPF_T_MICROTIME:
2543                 bintime2timeval(bt, &tsm);
2544                 ts->bt_sec = tsm.tv_sec;
2545                 ts->bt_frac = tsm.tv_usec;
2546                 break;
2547         case BPF_T_NANOTIME:
2548                 bintime2timespec(bt, &tsn);
2549                 ts->bt_sec = tsn.tv_sec;
2550                 ts->bt_frac = tsn.tv_nsec;
2551                 break;
2552         case BPF_T_BINTIME:
2553                 ts->bt_sec = bt->sec;
2554                 ts->bt_frac = bt->frac;
2555                 break;
2556         }
2557 }
2558
2559 /*
2560  * Move the packet data from interface memory (pkt) into the
2561  * store buffer.  "cpfn" is the routine called to do the actual data
2562  * transfer.  bcopy is passed in to copy contiguous chunks, while
2563  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2564  * pkt is really an mbuf.
2565  */
2566 static void
2567 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2568     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2569     struct bintime *bt)
2570 {
2571         static char zeroes[BPF_ALIGNMENT];
2572         struct bpf_xhdr hdr;
2573 #ifndef BURN_BRIDGES
2574         struct bpf_hdr hdr_old;
2575 #ifdef COMPAT_FREEBSD32
2576         struct bpf_hdr32 hdr32_old;
2577 #endif
2578 #endif
2579         int caplen, curlen, hdrlen, pad, totlen;
2580         int do_wakeup = 0;
2581         int do_timestamp;
2582         int tstype;
2583
2584         BPFD_LOCK_ASSERT(d);
2585         if (d->bd_bif == NULL) {
2586                 /* Descriptor was detached in concurrent thread */
2587                 counter_u64_add(d->bd_dcount, 1);
2588                 return;
2589         }
2590
2591         /*
2592          * Detect whether user space has released a buffer back to us, and if
2593          * so, move it from being a hold buffer to a free buffer.  This may
2594          * not be the best place to do it (for example, we might only want to
2595          * run this check if we need the space), but for now it's a reliable
2596          * spot to do it.
2597          */
2598         if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2599                 d->bd_fbuf = d->bd_hbuf;
2600                 d->bd_hbuf = NULL;
2601                 d->bd_hlen = 0;
2602                 bpf_buf_reclaimed(d);
2603         }
2604
2605         /*
2606          * Figure out how many bytes to move.  If the packet is
2607          * greater or equal to the snapshot length, transfer that
2608          * much.  Otherwise, transfer the whole packet (unless
2609          * we hit the buffer size limit).
2610          */
2611         hdrlen = bpf_hdrlen(d);
2612         totlen = hdrlen + min(snaplen, pktlen);
2613         if (totlen > d->bd_bufsize)
2614                 totlen = d->bd_bufsize;
2615
2616         /*
2617          * Round up the end of the previous packet to the next longword.
2618          *
2619          * Drop the packet if there's no room and no hope of room
2620          * If the packet would overflow the storage buffer or the storage
2621          * buffer is considered immutable by the buffer model, try to rotate
2622          * the buffer and wakeup pending processes.
2623          */
2624 #ifdef COMPAT_FREEBSD32
2625         if (d->bd_compat32)
2626                 curlen = BPF_WORDALIGN32(d->bd_slen);
2627         else
2628 #endif
2629                 curlen = BPF_WORDALIGN(d->bd_slen);
2630         if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2631                 if (d->bd_fbuf == NULL) {
2632                         /*
2633                          * There's no room in the store buffer, and no
2634                          * prospect of room, so drop the packet.  Notify the
2635                          * buffer model.
2636                          */
2637                         bpf_buffull(d);
2638                         counter_u64_add(d->bd_dcount, 1);
2639                         return;
2640                 }
2641                 KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
2642                 ROTATE_BUFFERS(d);
2643                 do_wakeup = 1;
2644                 curlen = 0;
2645         } else {
2646                 if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT) {
2647                         /*
2648                          * Immediate mode is set, or the read timeout has
2649                          * already expired during a select call.  A packet
2650                          * arrived, so the reader should be woken up.
2651                          */
2652                         do_wakeup = 1;
2653                 }
2654                 pad = curlen - d->bd_slen;
2655                 KASSERT(pad >= 0 && pad <= sizeof(zeroes),
2656                     ("%s: invalid pad byte count %d", __func__, pad));
2657                 if (pad > 0) {
2658                         /* Zero pad bytes. */
2659                         bpf_append_bytes(d, d->bd_sbuf, d->bd_slen, zeroes,
2660                             pad);
2661                 }
2662         }
2663
2664         caplen = totlen - hdrlen;
2665         tstype = d->bd_tstamp;
2666         do_timestamp = tstype != BPF_T_NONE;
2667 #ifndef BURN_BRIDGES
2668         if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2669                 struct bpf_ts ts;
2670                 if (do_timestamp)
2671                         bpf_bintime2ts(bt, &ts, tstype);
2672 #ifdef COMPAT_FREEBSD32
2673                 if (d->bd_compat32) {
2674                         bzero(&hdr32_old, sizeof(hdr32_old));
2675                         if (do_timestamp) {
2676                                 hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2677                                 hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2678                         }
2679                         hdr32_old.bh_datalen = pktlen;
2680                         hdr32_old.bh_hdrlen = hdrlen;
2681                         hdr32_old.bh_caplen = caplen;
2682                         bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2683                             sizeof(hdr32_old));
2684                         goto copy;
2685                 }
2686 #endif
2687                 bzero(&hdr_old, sizeof(hdr_old));
2688                 if (do_timestamp) {
2689                         hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2690                         hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2691                 }
2692                 hdr_old.bh_datalen = pktlen;
2693                 hdr_old.bh_hdrlen = hdrlen;
2694                 hdr_old.bh_caplen = caplen;
2695                 bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2696                     sizeof(hdr_old));
2697                 goto copy;
2698         }
2699 #endif
2700
2701         /*
2702          * Append the bpf header.  Note we append the actual header size, but
2703          * move forward the length of the header plus padding.
2704          */
2705         bzero(&hdr, sizeof(hdr));
2706         if (do_timestamp)
2707                 bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2708         hdr.bh_datalen = pktlen;
2709         hdr.bh_hdrlen = hdrlen;
2710         hdr.bh_caplen = caplen;
2711         bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2712
2713         /*
2714          * Copy the packet data into the store buffer and update its length.
2715          */
2716 #ifndef BURN_BRIDGES
2717 copy:
2718 #endif
2719         (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2720         d->bd_slen = curlen + totlen;
2721
2722         if (do_wakeup)
2723                 bpf_wakeup(d);
2724 }
2725
2726 /*
2727  * Free buffers currently in use by a descriptor.
2728  * Called on close.
2729  */
2730 static void
2731 bpfd_free(epoch_context_t ctx)
2732 {
2733         struct bpf_d *d;
2734         struct bpf_program_buffer *p;
2735
2736         /*
2737          * We don't need to lock out interrupts since this descriptor has
2738          * been detached from its interface and it yet hasn't been marked
2739          * free.
2740          */
2741         d = __containerof(ctx, struct bpf_d, epoch_ctx);
2742         bpf_free(d);
2743         if (d->bd_rfilter != NULL) {
2744                 p = __containerof((void *)d->bd_rfilter,
2745                     struct bpf_program_buffer, buffer);
2746 #ifdef BPF_JITTER
2747                 p->func = d->bd_bfilter;
2748 #endif
2749                 bpf_program_buffer_free(&p->epoch_ctx);
2750         }
2751         if (d->bd_wfilter != NULL) {
2752                 p = __containerof((void *)d->bd_wfilter,
2753                     struct bpf_program_buffer, buffer);
2754 #ifdef BPF_JITTER
2755                 p->func = NULL;
2756 #endif
2757                 bpf_program_buffer_free(&p->epoch_ctx);
2758         }
2759
2760         mtx_destroy(&d->bd_lock);
2761         counter_u64_free(d->bd_rcount);
2762         counter_u64_free(d->bd_dcount);
2763         counter_u64_free(d->bd_fcount);
2764         counter_u64_free(d->bd_wcount);
2765         counter_u64_free(d->bd_wfcount);
2766         counter_u64_free(d->bd_wdcount);
2767         counter_u64_free(d->bd_zcopy);
2768         free(d, M_BPF);
2769 }
2770
2771 /*
2772  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
2773  * fixed size of the link header (variable length headers not yet supported).
2774  */
2775 void
2776 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2777 {
2778
2779         bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2780 }
2781
2782 /*
2783  * Attach an interface to bpf.  ifp is a pointer to the structure
2784  * defining the interface to be attached, dlt is the link layer type,
2785  * and hdrlen is the fixed size of the link header (variable length
2786  * headers are not yet supporrted).
2787  */
2788 void
2789 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
2790     struct bpf_if **driverp)
2791 {
2792         struct bpf_if *bp;
2793
2794         KASSERT(*driverp == NULL,
2795             ("bpfattach2: driverp already initialized"));
2796
2797         bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
2798
2799         CK_LIST_INIT(&bp->bif_dlist);
2800         CK_LIST_INIT(&bp->bif_wlist);
2801         bp->bif_ifp = ifp;
2802         bp->bif_dlt = dlt;
2803         bp->bif_hdrlen = hdrlen;
2804         bp->bif_bpf = driverp;
2805         refcount_init(&bp->bif_refcnt, 1);
2806         *driverp = bp;
2807         /*
2808          * Reference ifnet pointer, so it won't freed until
2809          * we release it.
2810          */
2811         if_ref(ifp);
2812         BPF_LOCK();
2813         CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
2814         BPF_UNLOCK();
2815
2816         if (bootverbose && IS_DEFAULT_VNET(curvnet))
2817                 if_printf(ifp, "bpf attached\n");
2818 }
2819
2820 #ifdef VIMAGE
2821 /*
2822  * When moving interfaces between vnet instances we need a way to
2823  * query the dlt and hdrlen before detach so we can re-attch the if_bpf
2824  * after the vmove.  We unfortunately have no device driver infrastructure
2825  * to query the interface for these values after creation/attach, thus
2826  * add this as a workaround.
2827  */
2828 int
2829 bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
2830 {
2831
2832         if (bp == NULL)
2833                 return (ENXIO);
2834         if (bif_dlt == NULL && bif_hdrlen == NULL)
2835                 return (0);
2836
2837         if (bif_dlt != NULL)
2838                 *bif_dlt = bp->bif_dlt;
2839         if (bif_hdrlen != NULL)
2840                 *bif_hdrlen = bp->bif_hdrlen;
2841
2842         return (0);
2843 }
2844 #endif
2845
2846 /*
2847  * Detach bpf from an interface. This involves detaching each descriptor
2848  * associated with the interface. Notify each descriptor as it's detached
2849  * so that any sleepers wake up and get ENXIO.
2850  */
2851 void
2852 bpfdetach(struct ifnet *ifp)
2853 {
2854         struct bpf_if *bp, *bp_temp;
2855         struct bpf_d *d;
2856
2857         BPF_LOCK();
2858         /* Find all bpf_if struct's which reference ifp and detach them. */
2859         CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
2860                 if (ifp != bp->bif_ifp)
2861                         continue;
2862
2863                 CK_LIST_REMOVE(bp, bif_next);
2864                 *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if;
2865
2866                 CTR4(KTR_NET,
2867                     "%s: sheduling free for encap %d (%p) for if %p",
2868                     __func__, bp->bif_dlt, bp, ifp);
2869
2870                 /* Detach common descriptors */
2871                 while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2872                         bpf_detachd_locked(d, true);
2873                 }
2874
2875                 /* Detach writer-only descriptors */
2876                 while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2877                         bpf_detachd_locked(d, true);
2878                 }
2879                 bpfif_rele(bp);
2880         }
2881         BPF_UNLOCK();
2882 }
2883
2884 /*
2885  * Get a list of available data link type of the interface.
2886  */
2887 static int
2888 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2889 {
2890         struct ifnet *ifp;
2891         struct bpf_if *bp;
2892         u_int *lst;
2893         int error, n, n1;
2894
2895         BPF_LOCK_ASSERT();
2896
2897         ifp = d->bd_bif->bif_ifp;
2898         n1 = 0;
2899         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2900                 if (bp->bif_ifp == ifp)
2901                         n1++;
2902         }
2903         if (bfl->bfl_list == NULL) {
2904                 bfl->bfl_len = n1;
2905                 return (0);
2906         }
2907         if (n1 > bfl->bfl_len)
2908                 return (ENOMEM);
2909
2910         lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2911         n = 0;
2912         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2913                 if (bp->bif_ifp != ifp)
2914                         continue;
2915                 lst[n++] = bp->bif_dlt;
2916         }
2917         error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2918         free(lst, M_TEMP);
2919         bfl->bfl_len = n;
2920         return (error);
2921 }
2922
2923 /*
2924  * Set the data link type of a BPF instance.
2925  */
2926 static int
2927 bpf_setdlt(struct bpf_d *d, u_int dlt)
2928 {
2929         int error, opromisc;
2930         struct ifnet *ifp;
2931         struct bpf_if *bp;
2932
2933         BPF_LOCK_ASSERT();
2934         MPASS(d->bd_bif != NULL);
2935
2936         /*
2937          * It is safe to check bd_bif without BPFD_LOCK, it can not be
2938          * changed while we hold global lock.
2939          */
2940         if (d->bd_bif->bif_dlt == dlt)
2941                 return (0);
2942
2943         ifp = d->bd_bif->bif_ifp;
2944         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2945                 if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
2946                         break;
2947         }
2948         if (bp == NULL)
2949                 return (EINVAL);
2950
2951         opromisc = d->bd_promisc;
2952         bpf_attachd(d, bp);
2953         if (opromisc) {
2954                 error = ifpromisc(bp->bif_ifp, 1);
2955                 if (error)
2956                         if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n",
2957                             __func__, error);
2958                 else
2959                         d->bd_promisc = 1;
2960         }
2961         return (0);
2962 }
2963
2964 static void
2965 bpf_drvinit(void *unused)
2966 {
2967         struct cdev *dev;
2968
2969         sx_init(&bpf_sx, "bpf global lock");
2970         CK_LIST_INIT(&bpf_iflist);
2971
2972         dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2973         /* For compatibility */
2974         make_dev_alias(dev, "bpf0");
2975 }
2976
2977 /*
2978  * Zero out the various packet counters associated with all of the bpf
2979  * descriptors.  At some point, we will probably want to get a bit more
2980  * granular and allow the user to specify descriptors to be zeroed.
2981  */
2982 static void
2983 bpf_zero_counters(void)
2984 {
2985         struct bpf_if *bp;
2986         struct bpf_d *bd;
2987
2988         BPF_LOCK();
2989         /*
2990          * We are protected by global lock here, interfaces and
2991          * descriptors can not be deleted while we hold it.
2992          */
2993         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2994                 CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2995                         counter_u64_zero(bd->bd_rcount);
2996                         counter_u64_zero(bd->bd_dcount);
2997                         counter_u64_zero(bd->bd_fcount);
2998                         counter_u64_zero(bd->bd_wcount);
2999                         counter_u64_zero(bd->bd_wfcount);
3000                         counter_u64_zero(bd->bd_zcopy);
3001                 }
3002         }
3003         BPF_UNLOCK();
3004 }
3005
3006 /*
3007  * Fill filter statistics
3008  */
3009 static void
3010 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
3011 {
3012
3013         BPF_LOCK_ASSERT();
3014         bzero(d, sizeof(*d));
3015         d->bd_structsize = sizeof(*d);
3016         d->bd_immediate = bd->bd_immediate;
3017         d->bd_promisc = bd->bd_promisc;
3018         d->bd_hdrcmplt = bd->bd_hdrcmplt;
3019         d->bd_direction = bd->bd_direction;
3020         d->bd_feedback = bd->bd_feedback;
3021         d->bd_async = bd->bd_async;
3022         d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
3023         d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
3024         d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
3025         d->bd_sig = bd->bd_sig;
3026         d->bd_slen = bd->bd_slen;
3027         d->bd_hlen = bd->bd_hlen;
3028         d->bd_bufsize = bd->bd_bufsize;
3029         d->bd_pid = bd->bd_pid;
3030         strlcpy(d->bd_ifname,
3031             bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
3032         d->bd_locked = bd->bd_locked;
3033         d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
3034         d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
3035         d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
3036         d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
3037         d->bd_bufmode = bd->bd_bufmode;
3038 }
3039
3040 /*
3041  * Handle `netstat -B' stats request
3042  */
3043 static int
3044 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
3045 {
3046         static const struct xbpf_d zerostats;
3047         struct xbpf_d *xbdbuf, *xbd, tempstats;
3048         int index, error;
3049         struct bpf_if *bp;
3050         struct bpf_d *bd;
3051
3052         /*
3053          * XXX This is not technically correct. It is possible for non
3054          * privileged users to open bpf devices. It would make sense
3055          * if the users who opened the devices were able to retrieve
3056          * the statistics for them, too.
3057          */
3058         error = priv_check(req->td, PRIV_NET_BPF);
3059         if (error)
3060                 return (error);
3061         /*
3062          * Check to see if the user is requesting that the counters be
3063          * zeroed out.  Explicitly check that the supplied data is zeroed,
3064          * as we aren't allowing the user to set the counters currently.
3065          */
3066         if (req->newptr != NULL) {
3067                 if (req->newlen != sizeof(tempstats))
3068                         return (EINVAL);
3069                 memset(&tempstats, 0, sizeof(tempstats));
3070                 error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
3071                 if (error)
3072                         return (error);
3073                 if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
3074                         return (EINVAL);
3075                 bpf_zero_counters();
3076                 return (0);
3077         }
3078         if (req->oldptr == NULL)
3079                 return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
3080         if (bpf_bpfd_cnt == 0)
3081                 return (SYSCTL_OUT(req, 0, 0));
3082         xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
3083         BPF_LOCK();
3084         if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
3085                 BPF_UNLOCK();
3086                 free(xbdbuf, M_BPF);
3087                 return (ENOMEM);
3088         }
3089         index = 0;
3090         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
3091                 /* Send writers-only first */
3092                 CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
3093                         xbd = &xbdbuf[index++];
3094                         bpfstats_fill_xbpf(xbd, bd);
3095                 }
3096                 CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
3097                         xbd = &xbdbuf[index++];
3098                         bpfstats_fill_xbpf(xbd, bd);
3099                 }
3100         }
3101         BPF_UNLOCK();
3102         error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
3103         free(xbdbuf, M_BPF);
3104         return (error);
3105 }
3106
/* Run bpf_drvinit() at SI_SUB_DRIVERS, SI_ORDER_MIDDLE, with a NULL argument. */
SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
3108
3109 #else /* !DEV_BPF && !NETGRAPH_BPF */
3110
3111 /*
3112  * NOP stubs to allow bpf-using drivers to load and function.
3113  *
3114  * A 'better' implementation would allow the core bpf functionality
3115  * to be loaded at runtime.
3116  */
3117
3118 void
3119 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
3120 {
3121 }
3122
3123 void
3124 bpf_tap_if(if_t ifp, u_char *pkt, u_int pktlen)
3125 {
3126 }
3127
/* NOP stub used when bpf is not compiled in: the mbuf is ignored. */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

        /* Intentionally empty. */
}
3132
3133 void
3134 bpf_mtap_if(if_t ifp, struct mbuf *m)
3135 {
3136 }
3137
3138 void
3139 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
3140 {
3141 }
3142
3143 void
3144 bpf_mtap2_if(if_t ifp, void *data, u_int dlen, struct mbuf *m)
3145 {
3146 }
3147
3148 void
3149 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3150 {
3151
3152         bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
3153 }
3154
3155 void
3156 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
3157 {
3158
3159         *driverp = (struct bpf_if *)&dead_bpf_if;
3160 }
3161
/* NOP stub used when bpf is not compiled in: there is nothing to detach. */
void
bpfdetach(struct ifnet *ifp)
{

        /* Intentionally empty. */
}
3166
3167 u_int
3168 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3169 {
3170         return -1;      /* "no filter" behaviour */
3171 }
3172
/*
 * Stub validator used when bpf is not compiled in: every program is
 * reported as invalid (0, i.e. false).
 */
int
bpf_validate(const struct bpf_insn *f, int len)
{

        return (0);
}
3178
3179 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3180
3181 #ifdef DDB
3182 static void
3183 bpf_show_bpf_if(struct bpf_if *bpf_if)
3184 {
3185
3186         if (bpf_if == NULL)
3187                 return;
3188         db_printf("%p:\n", bpf_if);
3189 #define BPF_DB_PRINTF(f, e)     db_printf("   %s = " f "\n", #e, bpf_if->e);
3190 #define BPF_DB_PRINTF_RAW(f, e) db_printf("   %s = " f "\n", #e, e);
3191         /* bif_ext.bif_next */
3192         /* bif_ext.bif_dlist */
3193         BPF_DB_PRINTF("%#x", bif_dlt);
3194         BPF_DB_PRINTF("%u", bif_hdrlen);
3195         /* bif_wlist */
3196         BPF_DB_PRINTF("%p", bif_ifp);
3197         BPF_DB_PRINTF("%p", bif_bpf);
3198         BPF_DB_PRINTF_RAW("%u", refcount_load(&bpf_if->bif_refcnt));
3199 }
3200
3201 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
3202 {
3203
3204         if (!have_addr) {
3205                 db_printf("usage: show bpf_if <struct bpf_if *>\n");
3206                 return;
3207         }
3208
3209         bpf_show_bpf_if((struct bpf_if *)addr);
3210 }
3211 #endif