sys/net/bpf.c

   1 /*-
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 1990, 1991, 1993
   5  *      The Regents of the University of California.  All rights reserved.
   6  * Copyright (c) 2019 Andrey V. Elsukov <ae@FreeBSD.org>
   7  *
   8  * This code is derived from the Stanford/CMU enet packet filter,
   9  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
  10  * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
  11  * Berkeley Laboratory.
  12  *
  13  * Redistribution and use in source and binary forms, with or without
  14  * modification, are permitted provided that the following conditions
  15  * are met:
  16  * 1. Redistributions of source code must retain the above copyright
  17  *    notice, this list of conditions and the following disclaimer.
  18  * 2. Redistributions in binary form must reproduce the above copyright
  19  *    notice, this list of conditions and the following disclaimer in the
  20  *    documentation and/or other materials provided with the distribution.
  21  * 3. Neither the name of the University nor the names of its contributors
  22  *    may be used to endorse or promote products derived from this software
  23  *    without specific prior written permission.
  24  *
  25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  *
  37  *      @(#)bpf.c       8.4 (Berkeley) 1/9/95
  38  */
  39
  40 #include <sys/cdefs.h>
  41 __FBSDID("$FreeBSD$");
  42
  43 #include "opt_bpf.h"
  44 #include "opt_ddb.h"
  45 #include "opt_netgraph.h"
  46
  47 #include <sys/param.h>
  48 #include <sys/conf.h>
  49 #include <sys/eventhandler.h>
  50 #include <sys/fcntl.h>
  51 #include <sys/jail.h>
  52 #include <sys/ktr.h>
  53 #include <sys/lock.h>
  54 #include <sys/malloc.h>
  55 #include <sys/mbuf.h>
  56 #include <sys/mutex.h>
  57 #include <sys/time.h>
  58 #include <sys/priv.h>
  59 #include <sys/proc.h>
  60 #include <sys/signalvar.h>
  61 #include <sys/filio.h>
  62 #include <sys/sockio.h>
  63 #include <sys/ttycom.h>
  64 #include <sys/uio.h>
  65 #include <sys/sysent.h>
  66 #include <sys/systm.h>
  67
  68 #include <sys/event.h>
  69 #include <sys/file.h>
  70 #include <sys/poll.h>
  71 #include <sys/proc.h>
  72
  73 #include <sys/socket.h>
  74
  75 #ifdef DDB
  76 #include <ddb/ddb.h>
  77 #endif
  78
  79 #include <net/if.h>
  80 #include <net/if_var.h>
  81 #include <net/if_vlan_var.h>
  82 #include <net/if_dl.h>
  83 #include <net/bpf.h>
  84 #include <net/bpf_buffer.h>
  85 #ifdef BPF_JITTER
  86 #include <net/bpf_jitter.h>
  87 #endif
  88 #include <net/bpf_zerocopy.h>
  89 #include <net/bpfdesc.h>
  90 #include <net/route.h>
  91 #include <net/vnet.h>
  92
  93 #include <netinet/in.h>
  94 #include <netinet/if_ether.h>
  95 #include <sys/kernel.h>
  96 #include <sys/sysctl.h>
  97
  98 #include <net80211/ieee80211_freebsd.h>
  99
 100 #include <security/mac/mac_framework.h>
 101
 102 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 103
/*
 * Stand-alone bpf_if_ext whose descriptor list is initialized empty.
 * NOTE(review): presumably substituted for a real bpf_if once its interface
 * has gone away, so that taps walking bif_dlist find no listeners -- confirm
 * against bpfdetach(), which is outside this view.
 */
static struct bpf_if_ext dead_bpf_if = {
        .bif_dlist = CK_LIST_HEAD_INITIALIZER()
};
 107
/*
 * BPF interface structure.  The public part (bif_ext) is kept as the first
 * member; the CTASSERT below enforces that it sits at offset 0.
 */
struct bpf_if {
/* Shorthand so members of bif_ext can be named directly on a bpf_if. */
#define bif_next        bif_ext.bif_next
#define bif_dlist       bif_ext.bif_dlist
        struct bpf_if_ext bif_ext;      /* public members */
        u_int           bif_dlt;        /* link layer type */
        u_int           bif_hdrlen;     /* length of link header */
        struct bpfd_list bif_wlist;     /* writer-only list */
        struct ifnet    *bif_ifp;       /* corresponding interface */
        struct bpf_if   **bif_bpf;      /* Pointer to pointer to us */
        volatile u_int  bif_refcnt;     /* managed by bpfif_ref()/bpfif_rele() */
        struct epoch_context epoch_ctx; /* used by bpfif_rele() to defer bpfif_free() */
};

/* bif_ext must remain the first member of struct bpf_if. */
CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 122
/*
 * Header placed in front of an allocated filter program.  Allocated by
 * bpf_program_buffer_alloc() with 'size' extra bytes following the header
 * and released through bpf_program_buffer_free().
 */
struct bpf_program_buffer {
        struct epoch_context    epoch_ctx;      /* lets the free routine recover the buffer */
#ifdef BPF_JITTER
        bpf_jit_filter          *func;          /* destroyed on free when non-NULL */
#endif
        void                    *buffer[0];     /* start of the trailing storage */
};
 130
 131 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 132
#define PRINET  26                      /* interruptible */
#define BPF_PRIO_MAX    7

/*
 * Size of a BPF packet header of the given struct type, measured from the
 * start of the struct to the end of its bh_hdrlen member (any padding that
 * follows bh_hdrlen is not counted).
 */
#define SIZEOF_BPF_HDR(type)    \
    (offsetof(type, bh_hdrlen) + sizeof(((type *)0)->bh_hdrlen))
 138
/*
 * Definitions used only when 32-bit compatibility is compiled in: 32-bit
 * layouts of the structures exchanged with userland and the matching ioctl
 * request numbers.
 */
#ifdef COMPAT_FREEBSD32
#include <sys/mount.h>
#include <compat/freebsd32/freebsd32.h>
/* Word alignment for 32-bit streams: round up to a multiple of sizeof(int32_t). */
#define BPF_ALIGNMENT32 sizeof(int32_t)
#define BPF_WORDALIGN32(x) roundup2(x, BPF_ALIGNMENT32)

#ifndef BURN_BRIDGES
/*
 * 32-bit version of structure prepended to each packet.  We use this header
 * instead of the standard one for 32-bit streams.  We mark a stream as
 * 32-bit the first time we see a 32-bit compat ioctl request.
 */
struct bpf_hdr32 {
        struct timeval32 bh_tstamp;     /* time stamp */
        uint32_t        bh_caplen;      /* length of captured portion */
        uint32_t        bh_datalen;     /* original length of packet */
        uint16_t        bh_hdrlen;      /* length of bpf header (this struct
                                           plus alignment padding) */
};
#endif

/*
 * 32-bit layout of struct bpf_program.
 * NOTE(review): bf_insns is presumably a 32-bit user pointer -- confirm
 * against the ioctl handler.
 */
struct bpf_program32 {
        u_int bf_len;
        uint32_t bf_insns;
};

/*
 * 32-bit layout of struct bpf_dltlist.
 * NOTE(review): bfl_list is presumably a 32-bit user pointer -- confirm
 * against the ioctl handler.
 */
struct bpf_dltlist32 {
        u_int   bfl_len;
        u_int   bfl_list;
};

/* ioctl request numbers built on the 32-bit structure layouts above. */
#define BIOCSETF32      _IOW('B', 103, struct bpf_program32)
#define BIOCSRTIMEOUT32 _IOW('B', 109, struct timeval32)
#define BIOCGRTIMEOUT32 _IOR('B', 110, struct timeval32)
#define BIOCGDLTLIST32  _IOWR('B', 121, struct bpf_dltlist32)
#define BIOCSETWF32     _IOW('B', 123, struct bpf_program32)
#define BIOCSETFNR32    _IOW('B', 130, struct bpf_program32)
#endif
 177
/*
 * Global BPF lock helpers.  bpf_sx is always taken exclusively, and
 * BPF_LOCK_ASSERT() checks that it is held exclusively.
 */
#define BPF_LOCK()         sx_xlock(&bpf_sx)
#define BPF_UNLOCK()            sx_xunlock(&bpf_sx)
#define BPF_LOCK_ASSERT()       sx_assert(&bpf_sx, SA_XLOCKED)
/*
 * bpf_iflist is a list of BPF interface structures, each corresponding to a
 * specific DLT. The same network interface might have several BPF interface
 * structures registered by different layers in the stack (i.e., 802.11
 * frames, ethernet frames, etc).
 */
CK_LIST_HEAD(bpf_iflist, bpf_if);
static struct bpf_iflist bpf_iflist;
static struct sx        bpf_sx;         /* bpf global lock */
/* Incremented by bpf_attachd(), decremented by bpf_detachd_locked(). */
static int              bpf_bpfd_cnt;
 191
/* Forward declarations of file-local functions. */

/* Reference counting of bpf_if structures. */
static void     bpfif_ref(struct bpf_if *);
static void     bpfif_rele(struct bpf_if *);

/* Descriptor reference counting, attach/detach and I/O helpers. */
static void     bpfd_ref(struct bpf_d *);
static void     bpfd_rele(struct bpf_d *);
static void     bpf_attachd(struct bpf_d *, struct bpf_if *);
static void     bpf_detachd(struct bpf_d *);
static void     bpf_detachd_locked(struct bpf_d *, bool);
static void     bpfd_free(epoch_context_t);
static int      bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
                    struct sockaddr *, int *, struct bpf_d *);
static int      bpf_setif(struct bpf_d *, struct ifreq *);
static void     bpf_timed_out(void *);
static __inline void
                bpf_wakeup(struct bpf_d *);
static void     catchpacket(struct bpf_d *, u_char *, u_int, u_int,
                    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
                    struct bintime *);
static void     reset_d(struct bpf_d *);
static int      bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
static int      bpf_getdltlist(struct bpf_d *, struct bpf_dltlist *);
static int      bpf_setdlt(struct bpf_d *, u_int);
/* kqueue filter callbacks, referenced by the filterops tables. */
static void     filt_bpfdetach(struct knote *);
static int      filt_bpfread(struct knote *, long);
static int      filt_bpfwrite(struct knote *, long);
static void     bpf_drvinit(void *);
static int      bpf_stats_sysctl(SYSCTL_HANDLER_ARGS);
 219
/* net.bpf sysctl tree. */
SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "bpf sysctl");
/* net.bpf.maxinsns: read-write, defaults to BPF_MAXINSNS. */
int bpf_maxinsns = BPF_MAXINSNS;
SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
    &bpf_maxinsns, 0, "Maximum bpf program instructions");
/* net.bpf.zerocopy_enable: read-write, off by default. */
static int bpf_zerocopy_enable = 0;
SYSCTL_INT(_net_bpf, OID_AUTO, zerocopy_enable, CTLFLAG_RW,
    &bpf_zerocopy_enable, 0, "Enable new zero-copy BPF buffer sessions");
/* net.bpf.stats: node served by the bpf_stats_sysctl() handler. */
static SYSCTL_NODE(_net_bpf, OID_AUTO, stats, CTLFLAG_MPSAFE | CTLFLAG_RW,
    bpf_stats_sysctl, "bpf statistics portal");

/*
 * net.bpf.optimize_writers: per-vnet setting, off by default.  Sampled by
 * bpf_attachd() to decide whether a new descriptor starts on the
 * writer-only list.
 */
VNET_DEFINE_STATIC(int, bpf_optimize_writers) = 0;
#define V_bpf_optimize_writers VNET(bpf_optimize_writers)
SYSCTL_INT(_net_bpf, OID_AUTO, optimize_writers, CTLFLAG_VNET | CTLFLAG_RWTUN,
    &VNET_NAME(bpf_optimize_writers), 0,
    "Do not send packets until BPF program is set");
 236
/* Character-device entry points, installed in bpf_cdevsw below. */
static  d_open_t        bpfopen;
static  d_read_t        bpfread;
static  d_write_t       bpfwrite;
static  d_ioctl_t       bpfioctl;
static  d_poll_t        bpfpoll;
static  d_kqfilter_t    bpfkqfilter;

/* Device switch for the "bpf" character device. */
static struct cdevsw bpf_cdevsw = {
        .d_version =    D_VERSION,
        .d_open =       bpfopen,
        .d_read =       bpfread,
        .d_write =      bpfwrite,
        .d_ioctl =      bpfioctl,
        .d_poll =       bpfpoll,
        .d_name =       "bpf",
        .d_kqfilter =   bpfkqfilter,
};

/* kqueue filter operations for read events; shares filt_bpfdetach(). */
static struct filterops bpfread_filtops = {
        .f_isfd = 1,
        .f_detach = filt_bpfdetach,
        .f_event = filt_bpfread,
};

/* kqueue filter operations for write events; shares filt_bpfdetach(). */
static struct filterops bpfwrite_filtops = {
        .f_isfd = 1,
        .f_detach = filt_bpfdetach,
        .f_event = filt_bpfwrite,
};
 266
/*
 * LOCKING MODEL USED BY BPF
 *
 * Locks:
 * 1) Global lock (BPF_LOCK).  Sx lock, used to protect some global
 * counters and all bpf_iflist changes, and to serialize ioctl access to
 * bpf descriptors.
 * 2) Descriptor lock.  Mutex, used to protect BPF buffers and various
 * structure fields used by bpf_*tap* code.
 *
 * Lock order: global lock, then descriptor lock.
 *
 * There are several possible consumers:
 *
 * 1. The kernel registers an interface pointer with bpfattach().
 * Each call allocates a new bpf_if structure, references the ifnet pointer
 * and links the bpf_if into the bpf_iflist chain.  This is protected by the
 * global lock.
 *
 * 2. A userland application issues ioctl() calls on a bpf_d descriptor.
 * All such calls are serialized by the global lock.  BPF filters can be
 * changed, but the pointer to the old filter is freed using
 * NET_EPOCH_CALL().  Thus it should be safe for bpf_tap/bpf_mtap* code to
 * access filter pointers, even if a change happens during bpf_tap
 * execution.  Destruction of a bpf_d descriptor is also done using
 * NET_EPOCH_CALL().
 *
 * 3. A userland application can write packets into a bpf_d descriptor.
 * There we need to be sure that the ifnet won't disappear during
 * bpfwrite().
 *
 * 4. The kernel invokes bpf_tap/bpf_mtap* functions.  Access to
 * bif_dlist is protected by a net_epoch_preempt section, so it should
 * be safe to access a bpf_d descriptor inside the section.
 *
 * 5. The kernel invokes bpfdetach() when an interface is destroyed.  All
 * lists are modified with the global lock held and the actual free() is
 * done using NET_EPOCH_CALL().
 */
 303
 304 static void
 305 bpfif_free(epoch_context_t ctx)
 306 {
 307         struct bpf_if *bp;
 308
 309         bp = __containerof(ctx, struct bpf_if, epoch_ctx);
 310         if_rele(bp->bif_ifp);
 311         free(bp, M_BPF);
 312 }
 313
 314 static void
 315 bpfif_ref(struct bpf_if *bp)
 316 {
 317
 318         refcount_acquire(&bp->bif_refcnt);
 319 }
 320
 321 static void
 322 bpfif_rele(struct bpf_if *bp)
 323 {
 324
 325         if (!refcount_release(&bp->bif_refcnt))
 326                 return;
 327         NET_EPOCH_CALL(bpfif_free, &bp->epoch_ctx);
 328 }
 329
 330 static void
 331 bpfd_ref(struct bpf_d *d)
 332 {
 333
 334         refcount_acquire(&d->bd_refcnt);
 335 }
 336
 337 static void
 338 bpfd_rele(struct bpf_d *d)
 339 {
 340
 341         if (!refcount_release(&d->bd_refcnt))
 342                 return;
 343         NET_EPOCH_CALL(bpfd_free, &d->epoch_ctx);
 344 }
 345
 346 static struct bpf_program_buffer*
 347 bpf_program_buffer_alloc(size_t size, int flags)
 348 {
 349
 350         return (malloc(sizeof(struct bpf_program_buffer) + size,
 351             M_BPF, flags));
 352 }
 353
 354 static void
 355 bpf_program_buffer_free(epoch_context_t ctx)
 356 {
 357         struct bpf_program_buffer *ptr;
 358
 359         ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
 360 #ifdef BPF_JITTER
 361         if (ptr->func != NULL)
 362                 bpf_destroy_jit_filter(ptr->func);
 363 #endif
 364         free(ptr, M_BPF);
 365 }
 366
 367 /*
 368  * Wrapper functions for various buffering methods.  If the set of buffer
 369  * modes expands, we will probably want to introduce a switch data structure
 370  * similar to protosw, et.
 371  */
 372 static void
 373 bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
 374     u_int len)
 375 {
 376
 377         BPFD_LOCK_ASSERT(d);
 378
 379         switch (d->bd_bufmode) {
 380         case BPF_BUFMODE_BUFFER:
 381                 return (bpf_buffer_append_bytes(d, buf, offset, src, len));
 382
 383         case BPF_BUFMODE_ZBUF:
 384                 counter_u64_add(d->bd_zcopy, 1);
 385                 return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
 386
 387         default:
 388                 panic("bpf_buf_append_bytes");
 389         }
 390 }
 391
 392 static void
 393 bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
 394     u_int len)
 395 {
 396
 397         BPFD_LOCK_ASSERT(d);
 398
 399         switch (d->bd_bufmode) {
 400         case BPF_BUFMODE_BUFFER:
 401                 return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
 402
 403         case BPF_BUFMODE_ZBUF:
 404                 counter_u64_add(d->bd_zcopy, 1);
 405                 return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
 406
 407         default:
 408                 panic("bpf_buf_append_mbuf");
 409         }
 410 }
 411
 412 /*
 413  * This function gets called when the free buffer is re-assigned.
 414  */
 415 static void
 416 bpf_buf_reclaimed(struct bpf_d *d)
 417 {
 418
 419         BPFD_LOCK_ASSERT(d);
 420
 421         switch (d->bd_bufmode) {
 422         case BPF_BUFMODE_BUFFER:
 423                 return;
 424
 425         case BPF_BUFMODE_ZBUF:
 426                 bpf_zerocopy_buf_reclaimed(d);
 427                 return;
 428
 429         default:
 430                 panic("bpf_buf_reclaimed");
 431         }
 432 }
 433
 434 /*
 435  * If the buffer mechanism has a way to decide that a held buffer can be made
 436  * free, then it is exposed via the bpf_canfreebuf() interface.  (1) is
 437  * returned if the buffer can be discarded, (0) is returned if it cannot.
 438  */
 439 static int
 440 bpf_canfreebuf(struct bpf_d *d)
 441 {
 442
 443         BPFD_LOCK_ASSERT(d);
 444
 445         switch (d->bd_bufmode) {
 446         case BPF_BUFMODE_ZBUF:
 447                 return (bpf_zerocopy_canfreebuf(d));
 448         }
 449         return (0);
 450 }
 451
 452 /*
 453  * Allow the buffer model to indicate that the current store buffer is
 454  * immutable, regardless of the appearance of space.  Return (1) if the
 455  * buffer is writable, and (0) if not.
 456  */
 457 static int
 458 bpf_canwritebuf(struct bpf_d *d)
 459 {
 460         BPFD_LOCK_ASSERT(d);
 461
 462         switch (d->bd_bufmode) {
 463         case BPF_BUFMODE_ZBUF:
 464                 return (bpf_zerocopy_canwritebuf(d));
 465         }
 466         return (1);
 467 }
 468
 469 /*
 470  * Notify buffer model that an attempt to write to the store buffer has
 471  * resulted in a dropped packet, in which case the buffer may be considered
 472  * full.
 473  */
 474 static void
 475 bpf_buffull(struct bpf_d *d)
 476 {
 477
 478         BPFD_LOCK_ASSERT(d);
 479
 480         switch (d->bd_bufmode) {
 481         case BPF_BUFMODE_ZBUF:
 482                 bpf_zerocopy_buffull(d);
 483                 break;
 484         }
 485 }
 486
 487 /*
 488  * Notify the buffer model that a buffer has moved into the hold position.
 489  */
 490 void
 491 bpf_bufheld(struct bpf_d *d)
 492 {
 493
 494         BPFD_LOCK_ASSERT(d);
 495
 496         switch (d->bd_bufmode) {
 497         case BPF_BUFMODE_ZBUF:
 498                 bpf_zerocopy_bufheld(d);
 499                 break;
 500         }
 501 }
 502
 503 static void
 504 bpf_free(struct bpf_d *d)
 505 {
 506
 507         switch (d->bd_bufmode) {
 508         case BPF_BUFMODE_BUFFER:
 509                 return (bpf_buffer_free(d));
 510
 511         case BPF_BUFMODE_ZBUF:
 512                 return (bpf_zerocopy_free(d));
 513
 514         default:
 515                 panic("bpf_buf_free");
 516         }
 517 }
 518
 519 static int
 520 bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
 521 {
 522
 523         if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 524                 return (EOPNOTSUPP);
 525         return (bpf_buffer_uiomove(d, buf, len, uio));
 526 }
 527
 528 static int
 529 bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
 530 {
 531
 532         if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
 533                 return (EOPNOTSUPP);
 534         return (bpf_buffer_ioctl_sblen(d, i));
 535 }
 536
 537 static int
 538 bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, size_t *i)
 539 {
 540
 541         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 542                 return (EOPNOTSUPP);
 543         return (bpf_zerocopy_ioctl_getzmax(td, d, i));
 544 }
 545
 546 static int
 547 bpf_ioctl_rotzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 548 {
 549
 550         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 551                 return (EOPNOTSUPP);
 552         return (bpf_zerocopy_ioctl_rotzbuf(td, d, bz));
 553 }
 554
 555 static int
 556 bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
 557 {
 558
 559         if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
 560                 return (EOPNOTSUPP);
 561         return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
 562 }
 563
 564 /*
 565  * General BPF functions.
 566  */
 567 static int
 568 bpf_movein(struct uio *uio, int linktype, struct ifnet *ifp, struct mbuf **mp,
 569     struct sockaddr *sockp, int *hdrlen, struct bpf_d *d)
 570 {
 571         const struct ieee80211_bpf_params *p;
 572         struct ether_header *eh;
 573         struct mbuf *m;
 574         int error;
 575         int len;
 576         int hlen;
 577         int slen;
 578
 579         /*
 580          * Build a sockaddr based on the data link layer type.
 581          * We do this at this level because the ethernet header
 582          * is copied directly into the data field of the sockaddr.
 583          * In the case of SLIP, there is no header and the packet
 584          * is forwarded as is.
 585          * Also, we are careful to leave room at the front of the mbuf
 586          * for the link level header.
 587          */
 588         switch (linktype) {
 589         case DLT_SLIP:
 590                 sockp->sa_family = AF_INET;
 591                 hlen = 0;
 592                 break;
 593
 594         case DLT_EN10MB:
 595                 sockp->sa_family = AF_UNSPEC;
 596                 /* XXX Would MAXLINKHDR be better? */
 597                 hlen = ETHER_HDR_LEN;
 598                 break;
 599
 600         case DLT_FDDI:
 601                 sockp->sa_family = AF_IMPLINK;
 602                 hlen = 0;
 603                 break;
 604
 605         case DLT_RAW:
 606                 sockp->sa_family = AF_UNSPEC;
 607                 hlen = 0;
 608                 break;
 609
 610         case DLT_NULL:
 611                 /*
 612                  * null interface types require a 4 byte pseudo header which
 613                  * corresponds to the address family of the packet.
 614                  */
 615                 sockp->sa_family = AF_UNSPEC;
 616                 hlen = 4;
 617                 break;
 618
 619         case DLT_ATM_RFC1483:
 620                 /*
 621                  * en atm driver requires 4-byte atm pseudo header.
 622                  * though it isn't standard, vpi:vci needs to be
 623                  * specified anyway.
 624                  */
 625                 sockp->sa_family = AF_UNSPEC;
 626                 hlen = 12;      /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
 627                 break;
 628
 629         case DLT_PPP:
 630                 sockp->sa_family = AF_UNSPEC;
 631                 hlen = 4;       /* This should match PPP_HDRLEN */
 632                 break;
 633
 634         case DLT_IEEE802_11:            /* IEEE 802.11 wireless */
 635                 sockp->sa_family = AF_IEEE80211;
 636                 hlen = 0;
 637                 break;
 638
 639         case DLT_IEEE802_11_RADIO:      /* IEEE 802.11 wireless w/ phy params */
 640                 sockp->sa_family = AF_IEEE80211;
 641                 sockp->sa_len = 12;     /* XXX != 0 */
 642                 hlen = sizeof(struct ieee80211_bpf_params);
 643                 break;
 644
 645         default:
 646                 return (EIO);
 647         }
 648
 649         len = uio->uio_resid;
 650         if (len < hlen || len - hlen > ifp->if_mtu)
 651                 return (EMSGSIZE);
 652
 653         /* Allocate a mbuf for our write, since m_get2 fails if len >= to MJUMPAGESIZE, use m_getjcl for bigger buffers */
 654         m = m_get3(len, M_WAITOK, MT_DATA, M_PKTHDR);
 655         if (m == NULL)
 656                 return (EIO);
 657         m->m_pkthdr.len = m->m_len = len;
 658         *mp = m;
 659
 660         error = uiomove(mtod(m, u_char *), len, uio);
 661         if (error)
 662                 goto bad;
 663
 664         slen = bpf_filter(d->bd_wfilter, mtod(m, u_char *), len, len);
 665         if (slen == 0) {
 666                 error = EPERM;
 667                 goto bad;
 668         }
 669
 670         /* Check for multicast destination */
 671         switch (linktype) {
 672         case DLT_EN10MB:
 673                 eh = mtod(m, struct ether_header *);
 674                 if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
 675                         if (bcmp(ifp->if_broadcastaddr, eh->ether_dhost,
 676                             ETHER_ADDR_LEN) == 0)
 677                                 m->m_flags |= M_BCAST;
 678                         else
 679                                 m->m_flags |= M_MCAST;
 680                 }
 681                 if (d->bd_hdrcmplt == 0) {
 682                         memcpy(eh->ether_shost, IF_LLADDR(ifp),
 683                             sizeof(eh->ether_shost));
 684                 }
 685                 break;
 686         }
 687
 688         /*
 689          * Make room for link header, and copy it to sockaddr
 690          */
 691         if (hlen != 0) {
 692                 if (sockp->sa_family == AF_IEEE80211) {
 693                         /*
 694                          * Collect true length from the parameter header
 695                          * NB: sockp is known to be zero'd so if we do a
 696                          *     short copy unspecified parameters will be
 697                          *     zero.
 698                          * NB: packet may not be aligned after stripping
 699                          *     bpf params
 700                          * XXX check ibp_vers
 701                          */
 702                         p = mtod(m, const struct ieee80211_bpf_params *);
 703                         hlen = p->ibp_len;
 704                         if (hlen > sizeof(sockp->sa_data)) {
 705                                 error = EINVAL;
 706                                 goto bad;
 707                         }
 708                 }
 709                 bcopy(mtod(m, const void *), sockp->sa_data, hlen);
 710         }
 711         *hdrlen = hlen;
 712
 713         return (0);
 714 bad:
 715         m_freem(m);
 716         return (error);
 717 }
 718
 719 /*
 720  * Attach descriptor to the bpf interface, i.e. make d listen on bp,
 721  * then reset its buffers and counters with reset_d().
 722  */
 723 static void
 724 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 725 {
 726         int op_w;
 727
 728         BPF_LOCK_ASSERT();
 729
 730         /*
 731          * Save sysctl value to protect from sysctl change
 732          * between reads
 733          */
 734         op_w = V_bpf_optimize_writers || d->bd_writer;
 735
 736         if (d->bd_bif != NULL)
 737                 bpf_detachd_locked(d, false);
 738         /*
 739          * Point d at bp, and add d to the interface's list.
 740          * Since there are many applications using BPF for
 741          * sending raw packets only (dhcpd, cdpd are good examples)
 742          * we can delay adding d to the list of active listeners until
 743          * some filter is configured.
 744          */
 745
 746         BPFD_LOCK(d);
 747         /*
 748          * Hold reference to bpif while descriptor uses this interface.
 749          */
 750         bpfif_ref(bp);
 751         d->bd_bif = bp;
 752         if (op_w != 0) {
 753                 /* Add to writers-only list */
 754                 CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 755                 /*
 756                  * We decrement bd_writer on every filter set operation.
 757                  * First BIOCSETF is done by pcap_open_live() to set up
 758                  * snap length. After that appliation usually sets its own
 759                  * filter.
 760                  */
 761                 d->bd_writer = 2;
 762         } else
 763                 CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 764
 765         reset_d(d);
 766
 767         /* Trigger EVFILT_WRITE events. */
 768         bpf_wakeup(d);
 769
 770         BPFD_UNLOCK(d);
 771         bpf_bpfd_cnt++;
 772
 773         CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
 774             __func__, d->bd_pid, d->bd_writer ? "writer" : "active");
 775
 776         if (op_w == 0)
 777                 EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
 778 }
 779
 780 /*
 781  * Check if we need to upgrade our descriptor @d from write-only mode.
 782  */
 783 static int
 784 bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
 785     int flen)
 786 {
 787         int is_snap, need_upgrade;
 788
 789         /*
 790          * Check if we've already upgraded or new filter is empty.
 791          */
 792         if (d->bd_writer == 0 || fcode == NULL)
 793                 return (0);
 794
 795         need_upgrade = 0;
 796
 797         /*
 798          * Check if cmd looks like snaplen setting from
 799          * pcap_bpf.c:pcap_open_live().
 800          * Note we're not checking .k value here:
 801          * while pcap_open_live() definitely sets to non-zero value,
 802          * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
 803          * do not consider upgrading immediately
 804          */
 805         if (cmd == BIOCSETF && flen == 1 &&
 806             fcode[0].code == (BPF_RET | BPF_K))
 807                 is_snap = 1;
 808         else
 809                 is_snap = 0;
 810
 811         if (is_snap == 0) {
 812                 /*
 813                  * We're setting first filter and it doesn't look like
 814                  * setting snaplen.  We're probably using bpf directly.
 815                  * Upgrade immediately.
 816                  */
 817                 need_upgrade = 1;
 818         } else {
 819                 /*
 820                  * Do not require upgrade by first BIOCSETF
 821                  * (used to set snaplen) by pcap_open_live().
 822                  */
 823
 824                 if (--d->bd_writer == 0) {
 825                         /*
 826                          * First snaplen filter has already
 827                          * been set. This is probably catch-all
 828                          * filter
 829                          */
 830                         need_upgrade = 1;
 831                 }
 832         }
 833
 834         CTR5(KTR_NET,
 835             "%s: filter function set by pid %d, "
 836             "bd_writer counter %d, snap %d upgrade %d",
 837             __func__, d->bd_pid, d->bd_writer,
 838             is_snap, need_upgrade);
 839
 840         return (need_upgrade);
 841 }
 842
 843 /*
 844  * Detach a file from its interface.
 845  */
 846 static void
 847 bpf_detachd(struct bpf_d *d)
 848 {
 849         BPF_LOCK();
 850         bpf_detachd_locked(d, false);
 851         BPF_UNLOCK();
 852 }
 853
 854 static void
 855 bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 856 {
 857         struct bpf_if *bp;
 858         struct ifnet *ifp;
 859         int error;
 860
 861         BPF_LOCK_ASSERT();
 862         CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 863
 864         /* Check if descriptor is attached */
 865         if ((bp = d->bd_bif) == NULL)
 866                 return;
 867
 868         BPFD_LOCK(d);
 869         /* Remove d from the interface's descriptor list. */
 870         CK_LIST_REMOVE(d, bd_next);
 871         /* Save bd_writer value */
 872         error = d->bd_writer;
 873         ifp = bp->bif_ifp;
 874         d->bd_bif = NULL;
 875         if (detached_ifp) {
 876                 /*
 877                  * Notify descriptor as it's detached, so that any
 878                  * sleepers wake up and get ENXIO.
 879                  */
 880                 bpf_wakeup(d);
 881         }
 882         BPFD_UNLOCK(d);
 883         bpf_bpfd_cnt--;
 884
 885         /* Call event handler iff d is attached */
 886         if (error == 0)
 887                 EVENTHANDLER_INVOKE(bpf_track, ifp, bp->bif_dlt, 0);
 888
 889         /*
 890          * Check if this descriptor had requested promiscuous mode.
 891          * If so and ifnet is not detached, turn it off.
 892          */
 893         if (d->bd_promisc && !detached_ifp) {
 894                 d->bd_promisc = 0;
 895                 CURVNET_SET(ifp->if_vnet);
 896                 error = ifpromisc(ifp, 0);
 897                 CURVNET_RESTORE();
 898                 if (error != 0 && error != ENXIO) {
 899                         /*
 900                          * ENXIO can happen if a pccard is unplugged
 901                          * Something is really wrong if we were able to put
 902                          * the driver into promiscuous mode, but can't
 903                          * take it out.
 904                          */
 905                         if_printf(bp->bif_ifp,
 906                                 "bpf_detach: ifpromisc failed (%d)\n", error);
 907                 }
 908         }
 909         bpfif_rele(bp);
 910 }
 911
 912 /*
 913  * Close the descriptor by detaching it from its interface,
 914  * deallocating its buffers, and marking it free.
 915  */
 916 static void
 917 bpf_dtor(void *data)
 918 {
 919         struct bpf_d *d = data;
 920
 921         BPFD_LOCK(d);
 922         if (d->bd_state == BPF_WAITING)
 923                 callout_stop(&d->bd_callout);
 924         d->bd_state = BPF_IDLE;
 925         BPFD_UNLOCK(d);
 926         funsetown(&d->bd_sigio);
 927         bpf_detachd(d);
 928 #ifdef MAC
 929         mac_bpfdesc_destroy(d);
 930 #endif /* MAC */
 931         seldrain(&d->bd_sel);
 932         knlist_destroy(&d->bd_sel.si_note);
 933         callout_drain(&d->bd_callout);
 934         bpfd_rele(d);
 935 }
 936
 937 /*
 938  * Open ethernet device.  Returns ENXIO for illegal minor device number,
 939  * EBUSY if file is open by another process.
 940  */
 941 /* ARGSUSED */
 942 static  int
 943 bpfopen(struct cdev *dev, int flags, int fmt, struct thread *td)
 944 {
 945         struct bpf_d *d;
 946         int error;
 947
 948         d = malloc(sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 949         error = devfs_set_cdevpriv(d, bpf_dtor);
 950         if (error != 0) {
 951                 free(d, M_BPF);
 952                 return (error);
 953         }
 954
 955         /* Setup counters */
 956         d->bd_rcount = counter_u64_alloc(M_WAITOK);
 957         d->bd_dcount = counter_u64_alloc(M_WAITOK);
 958         d->bd_fcount = counter_u64_alloc(M_WAITOK);
 959         d->bd_wcount = counter_u64_alloc(M_WAITOK);
 960         d->bd_wfcount = counter_u64_alloc(M_WAITOK);
 961         d->bd_wdcount = counter_u64_alloc(M_WAITOK);
 962         d->bd_zcopy = counter_u64_alloc(M_WAITOK);
 963
 964         /*
 965          * For historical reasons, perform a one-time initialization call to
 966          * the buffer routines, even though we're not yet committed to a
 967          * particular buffer method.
 968          */
 969         bpf_buffer_init(d);
 970         if ((flags & FREAD) == 0)
 971                 d->bd_writer = 2;
 972         d->bd_hbuf_in_use = 0;
 973         d->bd_bufmode = BPF_BUFMODE_BUFFER;
 974         d->bd_sig = SIGIO;
 975         d->bd_direction = BPF_D_INOUT;
 976         d->bd_refcnt = 1;
 977         BPF_PID_REFRESH(d, td);
 978 #ifdef MAC
 979         mac_bpfdesc_init(d);
 980         mac_bpfdesc_create(td->td_ucred, d);
 981 #endif
 982         mtx_init(&d->bd_lock, devtoname(dev), "bpf cdev lock", MTX_DEF);
 983         callout_init_mtx(&d->bd_callout, &d->bd_lock, 0);
 984         knlist_init_mtx(&d->bd_sel.si_note, &d->bd_lock);
 985
 986         /* Disable VLAN pcp tagging. */
 987         d->bd_pcp = 0;
 988
 989         return (0);
 990 }
 991
 992 /*
 993  *  bpfread - read next chunk of packets from buffers
 994  */
 995 static  int
 996 bpfread(struct cdev *dev, struct uio *uio, int ioflag)
 997 {
 998         struct bpf_d *d;
 999         int error;
1000         int non_block;
1001         int timed_out;
1002
1003         error = devfs_get_cdevpriv((void **)&d);
1004         if (error != 0)
1005                 return (error);
1006
1007         /*
1008          * Restrict application to use a buffer the same size as
1009          * as kernel buffers.
1010          */
1011         if (uio->uio_resid != d->bd_bufsize)
1012                 return (EINVAL);
1013
1014         non_block = ((ioflag & O_NONBLOCK) != 0);
1015
1016         BPFD_LOCK(d);
1017         BPF_PID_REFRESH_CUR(d);
1018         if (d->bd_bufmode != BPF_BUFMODE_BUFFER) {
1019                 BPFD_UNLOCK(d);
1020                 return (EOPNOTSUPP);
1021         }
1022         if (d->bd_state == BPF_WAITING)
1023                 callout_stop(&d->bd_callout);
1024         timed_out = (d->bd_state == BPF_TIMED_OUT);
1025         d->bd_state = BPF_IDLE;
1026         while (d->bd_hbuf_in_use) {
1027                 error = mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1028                     PRINET|PCATCH, "bd_hbuf", 0);
1029                 if (error != 0) {
1030                         BPFD_UNLOCK(d);
1031                         return (error);
1032                 }
1033         }
1034         /*
1035          * If the hold buffer is empty, then do a timed sleep, which
1036          * ends when the timeout expires or when enough packets
1037          * have arrived to fill the store buffer.
1038          */
1039         while (d->bd_hbuf == NULL) {
1040                 if (d->bd_slen != 0) {
1041                         /*
1042                          * A packet(s) either arrived since the previous
1043                          * read or arrived while we were asleep.
1044                          */
1045                         if (d->bd_immediate || non_block || timed_out) {
1046                                 /*
1047                                  * Rotate the buffers and return what's here
1048                                  * if we are in immediate mode, non-blocking
1049                                  * flag is set, or this descriptor timed out.
1050                                  */
1051                                 ROTATE_BUFFERS(d);
1052                                 break;
1053                         }
1054                 }
1055
1056                 /*
1057                  * No data is available, check to see if the bpf device
1058                  * is still pointed at a real interface.  If not, return
1059                  * ENXIO so that the userland process knows to rebind
1060                  * it before using it again.
1061                  */
1062                 if (d->bd_bif == NULL) {
1063                         BPFD_UNLOCK(d);
1064                         return (ENXIO);
1065                 }
1066
1067                 if (non_block) {
1068                         BPFD_UNLOCK(d);
1069                         return (EWOULDBLOCK);
1070                 }
1071                 error = msleep(d, &d->bd_lock, PRINET|PCATCH,
1072                      "bpf", d->bd_rtout);
1073                 if (error == EINTR || error == ERESTART) {
1074                         BPFD_UNLOCK(d);
1075                         return (error);
1076                 }
1077                 if (error == EWOULDBLOCK) {
1078                         /*
1079                          * On a timeout, return what's in the buffer,
1080                          * which may be nothing.  If there is something
1081                          * in the store buffer, we can rotate the buffers.
1082                          */
1083                         if (d->bd_hbuf)
1084                                 /*
1085                                  * We filled up the buffer in between
1086                                  * getting the timeout and arriving
1087                                  * here, so we don't need to rotate.
1088                                  */
1089                                 break;
1090
1091                         if (d->bd_slen == 0) {
1092                                 BPFD_UNLOCK(d);
1093                                 return (0);
1094                         }
1095                         ROTATE_BUFFERS(d);
1096                         break;
1097                 }
1098         }
1099         /*
1100          * At this point, we know we have something in the hold slot.
1101          */
1102         d->bd_hbuf_in_use = 1;
1103         BPFD_UNLOCK(d);
1104
1105         /*
1106          * Move data from hold buffer into user space.
1107          * We know the entire buffer is transferred since
1108          * we checked above that the read buffer is bpf_bufsize bytes.
1109          *
1110          * We do not have to worry about simultaneous reads because
1111          * we waited for sole access to the hold buffer above.
1112          */
1113         error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
1114
1115         BPFD_LOCK(d);
1116         KASSERT(d->bd_hbuf != NULL, ("bpfread: lost bd_hbuf"));
1117         d->bd_fbuf = d->bd_hbuf;
1118         d->bd_hbuf = NULL;
1119         d->bd_hlen = 0;
1120         bpf_buf_reclaimed(d);
1121         d->bd_hbuf_in_use = 0;
1122         wakeup(&d->bd_hbuf_in_use);
1123         BPFD_UNLOCK(d);
1124
1125         return (error);
1126 }
1127
1128 /*
1129  * If there are processes sleeping on this descriptor, wake them up.
1130  */
1131 static __inline void
1132 bpf_wakeup(struct bpf_d *d)
1133 {
1134
1135         BPFD_LOCK_ASSERT(d);
1136         if (d->bd_state == BPF_WAITING) {
1137                 callout_stop(&d->bd_callout);
1138                 d->bd_state = BPF_IDLE;
1139         }
1140         wakeup(d);
1141         if (d->bd_async && d->bd_sig && d->bd_sigio)
1142                 pgsigio(&d->bd_sigio, d->bd_sig, 0);
1143
1144         selwakeuppri(&d->bd_sel, PRINET);
1145         KNOTE_LOCKED(&d->bd_sel.si_note, 0);
1146 }
1147
1148 static void
1149 bpf_timed_out(void *arg)
1150 {
1151         struct bpf_d *d = (struct bpf_d *)arg;
1152
1153         BPFD_LOCK_ASSERT(d);
1154
1155         if (callout_pending(&d->bd_callout) ||
1156             !callout_active(&d->bd_callout))
1157                 return;
1158         if (d->bd_state == BPF_WAITING) {
1159                 d->bd_state = BPF_TIMED_OUT;
1160                 if (d->bd_slen != 0)
1161                         bpf_wakeup(d);
1162         }
1163 }
1164
1165 static int
1166 bpf_ready(struct bpf_d *d)
1167 {
1168
1169         BPFD_LOCK_ASSERT(d);
1170
1171         if (!bpf_canfreebuf(d) && d->bd_hlen != 0)
1172                 return (1);
1173         if ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
1174             d->bd_slen != 0)
1175                 return (1);
1176         return (0);
1177 }
1178
1179 static int
1180 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
1181 {
1182         struct route ro;
1183         struct sockaddr dst;
1184         struct epoch_tracker et;
1185         struct bpf_if *bp;
1186         struct bpf_d *d;
1187         struct ifnet *ifp;
1188         struct mbuf *m, *mc;
1189         int error, hlen;
1190
1191         error = devfs_get_cdevpriv((void **)&d);
1192         if (error != 0)
1193                 return (error);
1194
1195         NET_EPOCH_ENTER(et);
1196         BPFD_LOCK(d);
1197         BPF_PID_REFRESH_CUR(d);
1198         counter_u64_add(d->bd_wcount, 1);
1199         if ((bp = d->bd_bif) == NULL) {
1200                 error = ENXIO;
1201                 goto out_locked;
1202         }
1203
1204         ifp = bp->bif_ifp;
1205         if ((ifp->if_flags & IFF_UP) == 0) {
1206                 error = ENETDOWN;
1207                 goto out_locked;
1208         }
1209
1210         if (uio->uio_resid == 0)
1211                 goto out_locked;
1212
1213         bzero(&dst, sizeof(dst));
1214         m = NULL;
1215         hlen = 0;
1216
1217         /*
1218          * Take extra reference, unlock d and exit from epoch section,
1219          * since bpf_movein() can sleep.
1220          */
1221         bpfd_ref(d);
1222         NET_EPOCH_EXIT(et);
1223         BPFD_UNLOCK(d);
1224
1225         error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
1226             &m, &dst, &hlen, d);
1227
1228         if (error != 0) {
1229                 counter_u64_add(d->bd_wdcount, 1);
1230                 bpfd_rele(d);
1231                 return (error);
1232         }
1233
1234         BPFD_LOCK(d);
1235         /*
1236          * Check that descriptor is still attached to the interface.
1237          * This can happen on bpfdetach(). To avoid access to detached
1238          * ifnet, free mbuf and return ENXIO.
1239          */
1240         if (d->bd_bif == NULL) {
1241                 counter_u64_add(d->bd_wdcount, 1);
1242                 BPFD_UNLOCK(d);
1243                 bpfd_rele(d);
1244                 m_freem(m);
1245                 return (ENXIO);
1246         }
1247         counter_u64_add(d->bd_wfcount, 1);
1248         if (d->bd_hdrcmplt)
1249                 dst.sa_family = pseudo_AF_HDRCMPLT;
1250
1251         if (d->bd_feedback) {
1252                 mc = m_dup(m, M_NOWAIT);
1253                 if (mc != NULL)
1254                         mc->m_pkthdr.rcvif = ifp;
1255                 /* Set M_PROMISC for outgoing packets to be discarded. */
1256                 if (d->bd_direction == BPF_D_INOUT)
1257                         m->m_flags |= M_PROMISC;
1258         } else
1259                 mc = NULL;
1260
1261         m->m_pkthdr.len -= hlen;
1262         m->m_len -= hlen;
1263         m->m_data += hlen;      /* XXX */
1264
1265         CURVNET_SET(ifp->if_vnet);
1266 #ifdef MAC
1267         mac_bpfdesc_create_mbuf(d, m);
1268         if (mc != NULL)
1269                 mac_bpfdesc_create_mbuf(d, mc);
1270 #endif
1271
1272         bzero(&ro, sizeof(ro));
1273         if (hlen != 0) {
1274                 ro.ro_prepend = (u_char *)&dst.sa_data;
1275                 ro.ro_plen = hlen;
1276                 ro.ro_flags = RT_HAS_HEADER;
1277         }
1278
1279         if (d->bd_pcp != 0)
1280                 vlan_set_pcp(m, d->bd_pcp);
1281
1282         /* Avoid possible recursion on BPFD_LOCK(). */
1283         NET_EPOCH_ENTER(et);
1284         BPFD_UNLOCK(d);
1285         error = (*ifp->if_output)(ifp, m, &dst, &ro);
1286         if (error)
1287                 counter_u64_add(d->bd_wdcount, 1);
1288
1289         if (mc != NULL) {
1290                 if (error == 0)
1291                         (*ifp->if_input)(ifp, mc);
1292                 else
1293                         m_freem(mc);
1294         }
1295         NET_EPOCH_EXIT(et);
1296         CURVNET_RESTORE();
1297         bpfd_rele(d);
1298         return (error);
1299
1300 out_locked:
1301         counter_u64_add(d->bd_wdcount, 1);
1302         NET_EPOCH_EXIT(et);
1303         BPFD_UNLOCK(d);
1304         return (error);
1305 }
1306
1307 /*
1308  * Reset a descriptor by flushing its packet buffer and clearing the receive
1309  * and drop counts.  This is doable for kernel-only buffers, but with
1310  * zero-copy buffers, we can't write to (or rotate) buffers that are
1311  * currently owned by userspace.  It would be nice if we could encapsulate
1312  * this logic in the buffer code rather than here.
1313  */
1314 static void
1315 reset_d(struct bpf_d *d)
1316 {
1317
1318         BPFD_LOCK_ASSERT(d);
1319
1320         while (d->bd_hbuf_in_use)
1321                 mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock, PRINET,
1322                     "bd_hbuf", 0);
1323         if ((d->bd_hbuf != NULL) &&
1324             (d->bd_bufmode != BPF_BUFMODE_ZBUF || bpf_canfreebuf(d))) {
1325                 /* Free the hold buffer. */
1326                 d->bd_fbuf = d->bd_hbuf;
1327                 d->bd_hbuf = NULL;
1328                 d->bd_hlen = 0;
1329                 bpf_buf_reclaimed(d);
1330         }
1331         if (bpf_canwritebuf(d))
1332                 d->bd_slen = 0;
1333         counter_u64_zero(d->bd_rcount);
1334         counter_u64_zero(d->bd_dcount);
1335         counter_u64_zero(d->bd_fcount);
1336         counter_u64_zero(d->bd_wcount);
1337         counter_u64_zero(d->bd_wfcount);
1338         counter_u64_zero(d->bd_wdcount);
1339         counter_u64_zero(d->bd_zcopy);
1340 }
1341
1342 /*
1343  *  FIONREAD            Check for read packet available.
1344  *  BIOCGBLEN           Get buffer len [for read()].
1345  *  BIOCSETF            Set read filter.
1346  *  BIOCSETFNR          Set read filter without resetting descriptor.
1347  *  BIOCSETWF           Set write filter.
1348  *  BIOCFLUSH           Flush read packet buffer.
1349  *  BIOCPROMISC         Put interface into promiscuous mode.
1350  *  BIOCGDLT            Get link layer type.
1351  *  BIOCGETIF           Get interface name.
1352  *  BIOCSETIF           Set interface.
1353  *  BIOCSRTIMEOUT       Set read timeout.
1354  *  BIOCGRTIMEOUT       Get read timeout.
1355  *  BIOCGSTATS          Get packet stats.
1356  *  BIOCIMMEDIATE       Set immediate mode.
1357  *  BIOCVERSION         Get filter language version.
1358  *  BIOCGHDRCMPLT       Get "header already complete" flag
1359  *  BIOCSHDRCMPLT       Set "header already complete" flag
1360  *  BIOCGDIRECTION      Get packet direction flag
1361  *  BIOCSDIRECTION      Set packet direction flag
1362  *  BIOCGTSTAMP         Get time stamp format and resolution.
1363  *  BIOCSTSTAMP         Set time stamp format and resolution.
1364  *  BIOCLOCK            Set "locked" flag
1365  *  BIOCFEEDBACK        Set packet feedback mode.
1366  *  BIOCSETZBUF         Set current zero-copy buffer locations.
1367  *  BIOCGETZMAX         Get maximum zero-copy buffer size.
1368  *  BIOCROTZBUF         Force rotation of zero-copy buffer
1369  *  BIOCSETBUFMODE      Set buffer mode.
1370  *  BIOCGETBUFMODE      Get current buffer mode.
1371  *  BIOCSETVLANPCP      Set VLAN PCP tag.
1372  */
1373 /* ARGSUSED */
1374 static  int
1375 bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1376     struct thread *td)
1377 {
1378         struct bpf_d *d;
1379         int error;
1380
1381         error = devfs_get_cdevpriv((void **)&d);
1382         if (error != 0)
1383                 return (error);
1384
1385         /*
1386          * Refresh PID associated with this descriptor.
1387          */
1388         BPFD_LOCK(d);
1389         BPF_PID_REFRESH(d, td);
1390         if (d->bd_state == BPF_WAITING)
1391                 callout_stop(&d->bd_callout);
1392         d->bd_state = BPF_IDLE;
1393         BPFD_UNLOCK(d);
1394
1395         if (d->bd_locked == 1) {
1396                 switch (cmd) {
1397                 case BIOCGBLEN:
1398                 case BIOCFLUSH:
1399                 case BIOCGDLT:
1400                 case BIOCGDLTLIST:
1401 #ifdef COMPAT_FREEBSD32
1402                 case BIOCGDLTLIST32:
1403 #endif
1404                 case BIOCGETIF:
1405                 case BIOCGRTIMEOUT:
1406 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1407                 case BIOCGRTIMEOUT32:
1408 #endif
1409                 case BIOCGSTATS:
1410                 case BIOCVERSION:
1411                 case BIOCGRSIG:
1412                 case BIOCGHDRCMPLT:
1413                 case BIOCSTSTAMP:
1414                 case BIOCFEEDBACK:
1415                 case FIONREAD:
1416                 case BIOCLOCK:
1417                 case BIOCSRTIMEOUT:
1418 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1419                 case BIOCSRTIMEOUT32:
1420 #endif
1421                 case BIOCIMMEDIATE:
1422                 case TIOCGPGRP:
1423                 case BIOCROTZBUF:
1424                         break;
1425                 default:
1426                         return (EPERM);
1427                 }
1428         }
1429 #ifdef COMPAT_FREEBSD32
1430         /*
1431          * If we see a 32-bit compat ioctl, mark the stream as 32-bit so
1432          * that it will get 32-bit packet headers.
1433          */
1434         switch (cmd) {
1435         case BIOCSETF32:
1436         case BIOCSETFNR32:
1437         case BIOCSETWF32:
1438         case BIOCGDLTLIST32:
1439         case BIOCGRTIMEOUT32:
1440         case BIOCSRTIMEOUT32:
1441                 if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1442                         BPFD_LOCK(d);
1443                         d->bd_compat32 = 1;
1444                         BPFD_UNLOCK(d);
1445                 }
1446         }
1447 #endif
1448
1449         CURVNET_SET(TD_TO_VNET(td));
1450         switch (cmd) {
1451         default:
1452                 error = EINVAL;
1453                 break;
1454
1455         /*
1456          * Check for read packet available.
1457          */
1458         case FIONREAD:
1459                 {
1460                         int n;
1461
1462                         BPFD_LOCK(d);
1463                         n = d->bd_slen;
1464                         while (d->bd_hbuf_in_use)
1465                                 mtx_sleep(&d->bd_hbuf_in_use, &d->bd_lock,
1466                                     PRINET, "bd_hbuf", 0);
1467                         if (d->bd_hbuf)
1468                                 n += d->bd_hlen;
1469                         BPFD_UNLOCK(d);
1470
1471                         *(int *)addr = n;
1472                         break;
1473                 }
1474
1475         /*
1476          * Get buffer len [for read()].
1477          */
1478         case BIOCGBLEN:
1479                 BPFD_LOCK(d);
1480                 *(u_int *)addr = d->bd_bufsize;
1481                 BPFD_UNLOCK(d);
1482                 break;
1483
1484         /*
1485          * Set buffer length.
1486          */
1487         case BIOCSBLEN:
1488                 error = bpf_ioctl_sblen(d, (u_int *)addr);
1489                 break;
1490
1491         /*
1492          * Set link layer read filter.
1493          */
1494         case BIOCSETF:
1495         case BIOCSETFNR:
1496         case BIOCSETWF:
1497 #ifdef COMPAT_FREEBSD32
1498         case BIOCSETF32:
1499         case BIOCSETFNR32:
1500         case BIOCSETWF32:
1501 #endif
1502                 error = bpf_setf(d, (struct bpf_program *)addr, cmd);
1503                 break;
1504
1505         /*
1506          * Flush read packet buffer.
1507          */
1508         case BIOCFLUSH:
1509                 BPFD_LOCK(d);
1510                 reset_d(d);
1511                 BPFD_UNLOCK(d);
1512                 break;
1513
1514         /*
1515          * Put interface into promiscuous mode.
1516          */
1517         case BIOCPROMISC:
1518                 if (d->bd_bif == NULL) {
1519                         /*
1520                          * No interface attached yet.
1521                          */
1522                         error = EINVAL;
1523                         break;
1524                 }
1525                 if (d->bd_promisc == 0) {
1526                         error = ifpromisc(d->bd_bif->bif_ifp, 1);
1527                         if (error == 0)
1528                                 d->bd_promisc = 1;
1529                 }
1530                 break;
1531
1532         /*
1533          * Get current data link type.
1534          */
1535         case BIOCGDLT:
1536                 BPF_LOCK();
1537                 if (d->bd_bif == NULL)
1538                         error = EINVAL;
1539                 else
1540                         *(u_int *)addr = d->bd_bif->bif_dlt;
1541                 BPF_UNLOCK();
1542                 break;
1543
1544         /*
1545          * Get a list of supported data link types.
1546          */
1547 #ifdef COMPAT_FREEBSD32
1548         case BIOCGDLTLIST32:
1549                 {
1550                         struct bpf_dltlist32 *list32;
1551                         struct bpf_dltlist dltlist;
1552
1553                         list32 = (struct bpf_dltlist32 *)addr;
1554                         dltlist.bfl_len = list32->bfl_len;
1555                         dltlist.bfl_list = PTRIN(list32->bfl_list);
1556                         BPF_LOCK();
1557                         if (d->bd_bif == NULL)
1558                                 error = EINVAL;
1559                         else {
1560                                 error = bpf_getdltlist(d, &dltlist);
1561                                 if (error == 0)
1562                                         list32->bfl_len = dltlist.bfl_len;
1563                         }
1564                         BPF_UNLOCK();
1565                         break;
1566                 }
1567 #endif
1568
1569         case BIOCGDLTLIST:
1570                 BPF_LOCK();
1571                 if (d->bd_bif == NULL)
1572                         error = EINVAL;
1573                 else
1574                         error = bpf_getdltlist(d, (struct bpf_dltlist *)addr);
1575                 BPF_UNLOCK();
1576                 break;
1577
1578         /*
1579          * Set data link type.
1580          */
1581         case BIOCSDLT:
1582                 BPF_LOCK();
1583                 if (d->bd_bif == NULL)
1584                         error = EINVAL;
1585                 else
1586                         error = bpf_setdlt(d, *(u_int *)addr);
1587                 BPF_UNLOCK();
1588                 break;
1589
1590         /*
1591          * Get interface name.
1592          */
1593         case BIOCGETIF:
1594                 BPF_LOCK();
1595                 if (d->bd_bif == NULL)
1596                         error = EINVAL;
1597                 else {
1598                         struct ifnet *const ifp = d->bd_bif->bif_ifp;
1599                         struct ifreq *const ifr = (struct ifreq *)addr;
1600
1601                         strlcpy(ifr->ifr_name, ifp->if_xname,
1602                             sizeof(ifr->ifr_name));
1603                 }
1604                 BPF_UNLOCK();
1605                 break;
1606
1607         /*
1608          * Set interface.
1609          */
1610         case BIOCSETIF:
1611                 {
1612                         int alloc_buf, size;
1613
1614                         /*
1615                          * Behavior here depends on the buffering model.  If
1616                          * we're using kernel memory buffers, then we can
1617                          * allocate them here.  If we're using zero-copy,
1618                          * then the user process must have registered buffers
1619                          * by the time we get here.
1620                          */
1621                         alloc_buf = 0;
1622                         BPFD_LOCK(d);
1623                         if (d->bd_bufmode == BPF_BUFMODE_BUFFER &&
1624                             d->bd_sbuf == NULL)
1625                                 alloc_buf = 1;
1626                         BPFD_UNLOCK(d);
1627                         if (alloc_buf) {
1628                                 size = d->bd_bufsize;
1629                                 error = bpf_buffer_ioctl_sblen(d, &size);
1630                                 if (error != 0)
1631                                         break;
1632                         }
1633                         BPF_LOCK();
1634                         error = bpf_setif(d, (struct ifreq *)addr);
1635                         BPF_UNLOCK();
1636                         break;
1637                 }
1638
1639         /*
1640          * Set read timeout.
1641          */
1642         case BIOCSRTIMEOUT:
1643 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1644         case BIOCSRTIMEOUT32:
1645 #endif
1646                 {
1647                         struct timeval *tv = (struct timeval *)addr;
1648 #if defined(COMPAT_FREEBSD32) && !defined(__mips__)
1649                         struct timeval32 *tv32;
1650                         struct timeval tv64;
1651
1652                         if (cmd == BIOCSRTIMEOUT32) {
1653                                 tv32 = (struct timeval32 *)addr;
1654                                 tv = &tv64;
1655                                 tv->tv_sec = tv32->tv_sec;
1656                                 tv->tv_usec = tv32->tv_usec;
1657                         } else
1658 #endif
1659                                 tv = (struct timeval *)addr;
1660
1661                         /*
1662                          * Subtract 1 tick from tvtohz() since this isn't
1663                          * a one-shot timer.
1664                          */
1665                         if ((error = itimerfix(tv)) == 0)
1666                                 d->bd_rtout = tvtohz(tv) - 1;
1667                         break;
1668                 }
1669
1670         /*
1671          * Get read timeout.
1672          */
1673         case BIOCGRTIMEOUT:
1674 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1675         case BIOCGRTIMEOUT32:
1676 #endif
1677                 {
1678                         struct timeval *tv;
1679 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1680                         struct timeval32 *tv32;
1681                         struct timeval tv64;
1682
1683                         if (cmd == BIOCGRTIMEOUT32)
1684                                 tv = &tv64;
1685                         else
1686 #endif
1687                                 tv = (struct timeval *)addr;
1688
1689                         tv->tv_sec = d->bd_rtout / hz;
1690                         tv->tv_usec = (d->bd_rtout % hz) * tick;
1691 #if defined(COMPAT_FREEBSD32) && defined(__amd64__)
1692                         if (cmd == BIOCGRTIMEOUT32) {
1693                                 tv32 = (struct timeval32 *)addr;
1694                                 tv32->tv_sec = tv->tv_sec;
1695                                 tv32->tv_usec = tv->tv_usec;
1696                         }
1697 #endif
1698
1699                         break;
1700                 }
1701
1702         /*
1703          * Get packet stats.
1704          */
1705         case BIOCGSTATS:
1706                 {
1707                         struct bpf_stat *bs = (struct bpf_stat *)addr;
1708
1709                         /* XXXCSJP overflow */
1710                         bs->bs_recv = (u_int)counter_u64_fetch(d->bd_rcount);
1711                         bs->bs_drop = (u_int)counter_u64_fetch(d->bd_dcount);
1712                         break;
1713                 }
1714
1715         /*
1716          * Set immediate mode.
1717          */
1718         case BIOCIMMEDIATE:
1719                 BPFD_LOCK(d);
1720                 d->bd_immediate = *(u_int *)addr;
1721                 BPFD_UNLOCK(d);
1722                 break;
1723
1724         case BIOCVERSION:
1725                 {
1726                         struct bpf_version *bv = (struct bpf_version *)addr;
1727
1728                         bv->bv_major = BPF_MAJOR_VERSION;
1729                         bv->bv_minor = BPF_MINOR_VERSION;
1730                         break;
1731                 }
1732
1733         /*
1734          * Get "header already complete" flag
1735          */
1736         case BIOCGHDRCMPLT:
1737                 BPFD_LOCK(d);
1738                 *(u_int *)addr = d->bd_hdrcmplt;
1739                 BPFD_UNLOCK(d);
1740                 break;
1741
1742         /*
1743          * Set "header already complete" flag
1744          */
1745         case BIOCSHDRCMPLT:
1746                 BPFD_LOCK(d);
1747                 d->bd_hdrcmplt = *(u_int *)addr ? 1 : 0;
1748                 BPFD_UNLOCK(d);
1749                 break;
1750
1751         /*
1752          * Get packet direction flag
1753          */
1754         case BIOCGDIRECTION:
1755                 BPFD_LOCK(d);
1756                 *(u_int *)addr = d->bd_direction;
1757                 BPFD_UNLOCK(d);
1758                 break;
1759
1760         /*
1761          * Set packet direction flag
1762          */
1763         case BIOCSDIRECTION:
1764                 {
1765                         u_int   direction;
1766
1767                         direction = *(u_int *)addr;
1768                         switch (direction) {
1769                         case BPF_D_IN:
1770                         case BPF_D_INOUT:
1771                         case BPF_D_OUT:
1772                                 BPFD_LOCK(d);
1773                                 d->bd_direction = direction;
1774                                 BPFD_UNLOCK(d);
1775                                 break;
1776                         default:
1777                                 error = EINVAL;
1778                         }
1779                 }
1780                 break;
1781
1782         /*
1783          * Get packet timestamp format and resolution.
1784          */
1785         case BIOCGTSTAMP:
1786                 BPFD_LOCK(d);
1787                 *(u_int *)addr = d->bd_tstamp;
1788                 BPFD_UNLOCK(d);
1789                 break;
1790
1791         /*
1792          * Set packet timestamp format and resolution.
1793          */
1794         case BIOCSTSTAMP:
1795                 {
1796                         u_int   func;
1797
1798                         func = *(u_int *)addr;
1799                         if (BPF_T_VALID(func))
1800                                 d->bd_tstamp = func;
1801                         else
1802                                 error = EINVAL;
1803                 }
1804                 break;
1805
1806         case BIOCFEEDBACK:
1807                 BPFD_LOCK(d);
1808                 d->bd_feedback = *(u_int *)addr;
1809                 BPFD_UNLOCK(d);
1810                 break;
1811
1812         case BIOCLOCK:
1813                 BPFD_LOCK(d);
1814                 d->bd_locked = 1;
1815                 BPFD_UNLOCK(d);
1816                 break;
1817
1818         case FIONBIO:           /* Non-blocking I/O */
1819                 break;
1820
1821         case FIOASYNC:          /* Send signal on receive packets */
1822                 BPFD_LOCK(d);
1823                 d->bd_async = *(int *)addr;
1824                 BPFD_UNLOCK(d);
1825                 break;
1826
1827         case FIOSETOWN:
1828                 /*
1829                  * XXX: Add some sort of locking here?
1830                  * fsetown() can sleep.
1831                  */
1832                 error = fsetown(*(int *)addr, &d->bd_sigio);
1833                 break;
1834
1835         case FIOGETOWN:
1836                 BPFD_LOCK(d);
1837                 *(int *)addr = fgetown(&d->bd_sigio);
1838                 BPFD_UNLOCK(d);
1839                 break;
1840
1841         /* This is deprecated, FIOSETOWN should be used instead. */
1842         case TIOCSPGRP:
1843                 error = fsetown(-(*(int *)addr), &d->bd_sigio);
1844                 break;
1845
1846         /* This is deprecated, FIOGETOWN should be used instead. */
1847         case TIOCGPGRP:
1848                 *(int *)addr = -fgetown(&d->bd_sigio);
1849                 break;
1850
1851         case BIOCSRSIG:         /* Set receive signal */
1852                 {
1853                         u_int sig;
1854
1855                         sig = *(u_int *)addr;
1856
1857                         if (sig >= NSIG)
1858                                 error = EINVAL;
1859                         else {
1860                                 BPFD_LOCK(d);
1861                                 d->bd_sig = sig;
1862                                 BPFD_UNLOCK(d);
1863                         }
1864                         break;
1865                 }
1866         case BIOCGRSIG:
1867                 BPFD_LOCK(d);
1868                 *(u_int *)addr = d->bd_sig;
1869                 BPFD_UNLOCK(d);
1870                 break;
1871
1872         case BIOCGETBUFMODE:
1873                 BPFD_LOCK(d);
1874                 *(u_int *)addr = d->bd_bufmode;
1875                 BPFD_UNLOCK(d);
1876                 break;
1877
1878         case BIOCSETBUFMODE:
1879                 /*
1880                  * Allow the buffering mode to be changed as long as we
1881                  * haven't yet committed to a particular mode.  Our
1882                  * definition of commitment, for now, is whether or not a
1883                  * buffer has been allocated or an interface attached, since
1884                  * that's the point where things get tricky.
1885                  */
1886                 switch (*(u_int *)addr) {
1887                 case BPF_BUFMODE_BUFFER:
1888                         break;
1889
1890                 case BPF_BUFMODE_ZBUF:
1891                         if (bpf_zerocopy_enable)
1892                                 break;
1893                         /* FALLSTHROUGH */
1894
1895                 default:
1896                         CURVNET_RESTORE();
1897                         return (EINVAL);
1898                 }
1899
1900                 BPFD_LOCK(d);
1901                 if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
1902                     d->bd_fbuf != NULL || d->bd_bif != NULL) {
1903                         BPFD_UNLOCK(d);
1904                         CURVNET_RESTORE();
1905                         return (EBUSY);
1906                 }
1907                 d->bd_bufmode = *(u_int *)addr;
1908                 BPFD_UNLOCK(d);
1909                 break;
1910
1911         case BIOCGETZMAX:
1912                 error = bpf_ioctl_getzmax(td, d, (size_t *)addr);
1913                 break;
1914
1915         case BIOCSETZBUF:
1916                 error = bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr);
1917                 break;
1918
1919         case BIOCROTZBUF:
1920                 error = bpf_ioctl_rotzbuf(td, d, (struct bpf_zbuf *)addr);
1921                 break;
1922
1923         case BIOCSETVLANPCP:
1924                 {
1925                         u_int pcp;
1926
1927                         pcp = *(u_int *)addr;
1928                         if (pcp > BPF_PRIO_MAX || pcp < 0) {
1929                                 error = EINVAL;
1930                                 break;
1931                         }
1932                         d->bd_pcp = pcp;
1933                         break;
1934                 }
1935         }
1936         CURVNET_RESTORE();
1937         return (error);
1938 }
1939
1940 /*
1941  * Set d's packet filter program to fp. If this file already has a filter,
1942  * free it and replace it. Returns EINVAL for bogus requests.
1943  *
1944  * Note we use global lock here to serialize bpf_setf() and bpf_setif()
1945  * calls.
1946  */
1947 static int
1948 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
1949 {
1950 #ifdef COMPAT_FREEBSD32
1951         struct bpf_program fp_swab;
1952         struct bpf_program32 *fp32;
1953 #endif
1954         struct bpf_program_buffer *fcode;
1955         struct bpf_insn *filter;
1956 #ifdef BPF_JITTER
1957         bpf_jit_filter *jfunc;
1958 #endif
1959         size_t size;
1960         u_int flen;
1961         bool track_event;
1962
1963 #ifdef COMPAT_FREEBSD32
1964         switch (cmd) {
1965         case BIOCSETF32:
1966         case BIOCSETWF32:
1967         case BIOCSETFNR32:
1968                 fp32 = (struct bpf_program32 *)fp;
1969                 fp_swab.bf_len = fp32->bf_len;
1970                 fp_swab.bf_insns =
1971                     (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
1972                 fp = &fp_swab;
1973                 switch (cmd) {
1974                 case BIOCSETF32:
1975                         cmd = BIOCSETF;
1976                         break;
1977                 case BIOCSETWF32:
1978                         cmd = BIOCSETWF;
1979                         break;
1980                 }
1981                 break;
1982         }
1983 #endif
1984
1985         filter = NULL;
1986 #ifdef BPF_JITTER
1987         jfunc = NULL;
1988 #endif
1989         /*
1990          * Check new filter validness before acquiring any locks.
1991          * Allocate memory for new filter, if needed.
1992          */
1993         flen = fp->bf_len;
1994         if (flen > bpf_maxinsns || (fp->bf_insns == NULL && flen != 0))
1995                 return (EINVAL);
1996         size = flen * sizeof(*fp->bf_insns);
1997         if (size > 0) {
1998                 /* We're setting up new filter. Copy and check actual data. */
1999                 fcode = bpf_program_buffer_alloc(size, M_WAITOK);
2000                 filter = (struct bpf_insn *)fcode->buffer;
2001                 if (copyin(fp->bf_insns, filter, size) != 0 ||
2002                     !bpf_validate(filter, flen)) {
2003                         free(fcode, M_BPF);
2004                         return (EINVAL);
2005                 }
2006 #ifdef BPF_JITTER
2007                 if (cmd != BIOCSETWF) {
2008                         /*
2009                          * Filter is copied inside fcode and is
2010                          * perfectly valid.
2011                          */
2012                         jfunc = bpf_jitter(filter, flen);
2013                 }
2014 #endif
2015         }
2016
2017         track_event = false;
2018         fcode = NULL;
2019
2020         BPF_LOCK();
2021         BPFD_LOCK(d);
2022         /* Set up new filter. */
2023         if (cmd == BIOCSETWF) {
2024                 if (d->bd_wfilter != NULL) {
2025                         fcode = __containerof((void *)d->bd_wfilter,
2026                             struct bpf_program_buffer, buffer);
2027 #ifdef BPF_JITTER
2028                         fcode->func = NULL;
2029 #endif
2030                 }
2031                 d->bd_wfilter = filter;
2032         } else {
2033                 if (d->bd_rfilter != NULL) {
2034                         fcode = __containerof((void *)d->bd_rfilter,
2035                             struct bpf_program_buffer, buffer);
2036 #ifdef BPF_JITTER
2037                         fcode->func = d->bd_bfilter;
2038 #endif
2039                 }
2040                 d->bd_rfilter = filter;
2041 #ifdef BPF_JITTER
2042                 d->bd_bfilter = jfunc;
2043 #endif
2044                 if (cmd == BIOCSETF)
2045                         reset_d(d);
2046
2047                 if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
2048                         /*
2049                          * Filter can be set several times without
2050                          * specifying interface. In this case just mark d
2051                          * as reader.
2052                          */
2053                         d->bd_writer = 0;
2054                         if (d->bd_bif != NULL) {
2055                                 /*
2056                                  * Remove descriptor from writers-only list
2057                                  * and add it to active readers list.
2058                                  */
2059                                 CK_LIST_REMOVE(d, bd_next);
2060                                 CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
2061                                     d, bd_next);
2062                                 CTR2(KTR_NET,
2063                                     "%s: upgrade required by pid %d",
2064                                     __func__, d->bd_pid);
2065                                 track_event = true;
2066                         }
2067                 }
2068         }
2069         BPFD_UNLOCK(d);
2070
2071         if (fcode != NULL)
2072                 NET_EPOCH_CALL(bpf_program_buffer_free, &fcode->epoch_ctx);
2073
2074         if (track_event)
2075                 EVENTHANDLER_INVOKE(bpf_track,
2076                     d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
2077
2078         BPF_UNLOCK();
2079         return (0);
2080 }
2081
2082 /*
2083  * Detach a file from its current interface (if attached at all) and attach
2084  * to the interface indicated by the name stored in ifr.
2085  * Return an errno or 0.
2086  */
2087 static int
2088 bpf_setif(struct bpf_d *d, struct ifreq *ifr)
2089 {
2090         struct bpf_if *bp;
2091         struct ifnet *theywant;
2092
2093         BPF_LOCK_ASSERT();
2094
2095         theywant = ifunit(ifr->ifr_name);
2096         if (theywant == NULL || theywant->if_bpf == NULL)
2097                 return (ENXIO);
2098
2099         bp = theywant->if_bpf;
2100         /*
2101          * At this point, we expect the buffer is already allocated.  If not,
2102          * return an error.
2103          */
2104         switch (d->bd_bufmode) {
2105         case BPF_BUFMODE_BUFFER:
2106         case BPF_BUFMODE_ZBUF:
2107                 if (d->bd_sbuf == NULL)
2108                         return (EINVAL);
2109                 break;
2110
2111         default:
2112                 panic("bpf_setif: bufmode %d", d->bd_bufmode);
2113         }
2114         if (bp != d->bd_bif)
2115                 bpf_attachd(d, bp);
2116         else {
2117                 BPFD_LOCK(d);
2118                 reset_d(d);
2119                 BPFD_UNLOCK(d);
2120         }
2121         return (0);
2122 }
2123
2124 /*
2125  * Support for select() and poll() system calls
2126  *
2127  * Return true iff the specific operation will not block indefinitely.
2128  * Otherwise, return false but make a note that a selwakeup() must be done.
2129  */
2130 static int
2131 bpfpoll(struct cdev *dev, int events, struct thread *td)
2132 {
2133         struct bpf_d *d;
2134         int revents;
2135
2136         if (devfs_get_cdevpriv((void **)&d) != 0 || d->bd_bif == NULL)
2137                 return (events &
2138                     (POLLHUP|POLLIN|POLLRDNORM|POLLOUT|POLLWRNORM));
2139
2140         /*
2141          * Refresh PID associated with this descriptor.
2142          */
2143         revents = events & (POLLOUT | POLLWRNORM);
2144         BPFD_LOCK(d);
2145         BPF_PID_REFRESH(d, td);
2146         if (events & (POLLIN | POLLRDNORM)) {
2147                 if (bpf_ready(d))
2148                         revents |= events & (POLLIN | POLLRDNORM);
2149                 else {
2150                         selrecord(td, &d->bd_sel);
2151                         /* Start the read timeout if necessary. */
2152                         if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2153                                 callout_reset(&d->bd_callout, d->bd_rtout,
2154                                     bpf_timed_out, d);
2155                                 d->bd_state = BPF_WAITING;
2156                         }
2157                 }
2158         }
2159         BPFD_UNLOCK(d);
2160         return (revents);
2161 }
2162
2163 /*
2164  * Support for kevent() system call.  Register EVFILT_READ filters and
2165  * reject all others.
2166  */
2167 int
2168 bpfkqfilter(struct cdev *dev, struct knote *kn)
2169 {
2170         struct bpf_d *d;
2171
2172         if (devfs_get_cdevpriv((void **)&d) != 0)
2173                 return (1);
2174
2175         switch (kn->kn_filter) {
2176         case EVFILT_READ:
2177                 kn->kn_fop = &bpfread_filtops;
2178                 break;
2179
2180         case EVFILT_WRITE:
2181                 kn->kn_fop = &bpfwrite_filtops;
2182                 break;
2183
2184         default:
2185                 return (1);
2186         }
2187
2188         /*
2189          * Refresh PID associated with this descriptor.
2190          */
2191         BPFD_LOCK(d);
2192         BPF_PID_REFRESH_CUR(d);
2193         kn->kn_hook = d;
2194         knlist_add(&d->bd_sel.si_note, kn, 1);
2195         BPFD_UNLOCK(d);
2196
2197         return (0);
2198 }
2199
2200 static void
2201 filt_bpfdetach(struct knote *kn)
2202 {
2203         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2204
2205         knlist_remove(&d->bd_sel.si_note, kn, 0);
2206 }
2207
2208 static int
2209 filt_bpfread(struct knote *kn, long hint)
2210 {
2211         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2212         int ready;
2213
2214         BPFD_LOCK_ASSERT(d);
2215         ready = bpf_ready(d);
2216         if (ready) {
2217                 kn->kn_data = d->bd_slen;
2218                 /*
2219                  * Ignore the hold buffer if it is being copied to user space.
2220                  */
2221                 if (!d->bd_hbuf_in_use && d->bd_hbuf)
2222                         kn->kn_data += d->bd_hlen;
2223         } else if (d->bd_rtout > 0 && d->bd_state == BPF_IDLE) {
2224                 callout_reset(&d->bd_callout, d->bd_rtout,
2225                     bpf_timed_out, d);
2226                 d->bd_state = BPF_WAITING;
2227         }
2228
2229         return (ready);
2230 }
2231
2232 static int
2233 filt_bpfwrite(struct knote *kn, long hint)
2234 {
2235         struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
2236
2237         BPFD_LOCK_ASSERT(d);
2238
2239         if (d->bd_bif == NULL) {
2240                 kn->kn_data = 0;
2241                 return (0);
2242         } else {
2243                 kn->kn_data = d->bd_bif->bif_ifp->if_mtu;
2244                 return (1);
2245         }
2246 }
2247
2248 #define BPF_TSTAMP_NONE         0
2249 #define BPF_TSTAMP_FAST         1
2250 #define BPF_TSTAMP_NORMAL       2
2251 #define BPF_TSTAMP_EXTERN       3
2252
2253 static int
2254 bpf_ts_quality(int tstype)
2255 {
2256
2257         if (tstype == BPF_T_NONE)
2258                 return (BPF_TSTAMP_NONE);
2259         if ((tstype & BPF_T_FAST) != 0)
2260                 return (BPF_TSTAMP_FAST);
2261
2262         return (BPF_TSTAMP_NORMAL);
2263 }
2264
2265 static int
2266 bpf_gettime(struct bintime *bt, int tstype, struct mbuf *m)
2267 {
2268         struct m_tag *tag;
2269         int quality;
2270
2271         quality = bpf_ts_quality(tstype);
2272         if (quality == BPF_TSTAMP_NONE)
2273                 return (quality);
2274
2275         if (m != NULL) {
2276                 tag = m_tag_locate(m, MTAG_BPF, MTAG_BPF_TIMESTAMP, NULL);
2277                 if (tag != NULL) {
2278                         *bt = *(struct bintime *)(tag + 1);
2279                         return (BPF_TSTAMP_EXTERN);
2280                 }
2281         }
2282         if (quality == BPF_TSTAMP_NORMAL)
2283                 binuptime(bt);
2284         else
2285                 getbinuptime(bt);
2286
2287         return (quality);
2288 }
2289
2290 /*
2291  * Incoming linkage from device drivers.  Process the packet pkt, of length
2292  * pktlen, which is stored in a contiguous buffer.  The packet is parsed
2293  * by each process' filter, and if accepted, stashed into the corresponding
2294  * buffer.
2295  */
2296 void
2297 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
2298 {
2299         struct epoch_tracker et;
2300         struct bintime bt;
2301         struct bpf_d *d;
2302 #ifdef BPF_JITTER
2303         bpf_jit_filter *bf;
2304 #endif
2305         u_int slen;
2306         int gottime;
2307
2308         gottime = BPF_TSTAMP_NONE;
2309         NET_EPOCH_ENTER(et);
2310         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2311                 counter_u64_add(d->bd_rcount, 1);
2312                 /*
2313                  * NB: We dont call BPF_CHECK_DIRECTION() here since there
2314                  * is no way for the caller to indiciate to us whether this
2315                  * packet is inbound or outbound. In the bpf_mtap() routines,
2316                  * we use the interface pointers on the mbuf to figure it out.
2317                  */
2318 #ifdef BPF_JITTER
2319                 bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2320                 if (bf != NULL)
2321                         slen = (*(bf->func))(pkt, pktlen, pktlen);
2322                 else
2323 #endif
2324                 slen = bpf_filter(d->bd_rfilter, pkt, pktlen, pktlen);
2325                 if (slen != 0) {
2326                         /*
2327                          * Filter matches. Let's to acquire write lock.
2328                          */
2329                         BPFD_LOCK(d);
2330                         counter_u64_add(d->bd_fcount, 1);
2331                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2332                                 gottime = bpf_gettime(&bt, d->bd_tstamp,
2333                                     NULL);
2334 #ifdef MAC
2335                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2336 #endif
2337                                 catchpacket(d, pkt, pktlen, slen,
2338                                     bpf_append_bytes, &bt);
2339                         BPFD_UNLOCK(d);
2340                 }
2341         }
2342         NET_EPOCH_EXIT(et);
2343 }
2344
2345 #define BPF_CHECK_DIRECTION(d, r, i)                            \
2346             (((d)->bd_direction == BPF_D_IN && (r) != (i)) ||   \
2347             ((d)->bd_direction == BPF_D_OUT && (r) == (i)))
2348
2349 /*
2350  * Incoming linkage from device drivers, when packet is in an mbuf chain.
2351  * Locking model is explained in bpf_tap().
2352  */
2353 void
2354 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
2355 {
2356         struct epoch_tracker et;
2357         struct bintime bt;
2358         struct bpf_d *d;
2359 #ifdef BPF_JITTER
2360         bpf_jit_filter *bf;
2361 #endif
2362         u_int pktlen, slen;
2363         int gottime;
2364
2365         /* Skip outgoing duplicate packets. */
2366         if ((m->m_flags & M_PROMISC) != 0 && m_rcvif(m) == NULL) {
2367                 m->m_flags &= ~M_PROMISC;
2368                 return;
2369         }
2370
2371         pktlen = m_length(m, NULL);
2372         gottime = BPF_TSTAMP_NONE;
2373
2374         NET_EPOCH_ENTER(et);
2375         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2376                 if (BPF_CHECK_DIRECTION(d, m_rcvif(m), bp->bif_ifp))
2377                         continue;
2378                 counter_u64_add(d->bd_rcount, 1);
2379 #ifdef BPF_JITTER
2380                 bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
2381                 /* XXX We cannot handle multiple mbufs. */
2382                 if (bf != NULL && m->m_next == NULL)
2383                         slen = (*(bf->func))(mtod(m, u_char *), pktlen,
2384                             pktlen);
2385                 else
2386 #endif
2387                 slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
2388                 if (slen != 0) {
2389                         BPFD_LOCK(d);
2390
2391                         counter_u64_add(d->bd_fcount, 1);
2392                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2393                                 gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2394 #ifdef MAC
2395                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2396 #endif
2397                                 catchpacket(d, (u_char *)m, pktlen, slen,
2398                                     bpf_append_mbuf, &bt);
2399                         BPFD_UNLOCK(d);
2400                 }
2401         }
2402         NET_EPOCH_EXIT(et);
2403 }
2404
2405 /*
2406  * Incoming linkage from device drivers, when packet is in
2407  * an mbuf chain and to be prepended by a contiguous header.
2408  */
2409 void
2410 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
2411 {
2412         struct epoch_tracker et;
2413         struct bintime bt;
2414         struct mbuf mb;
2415         struct bpf_d *d;
2416         u_int pktlen, slen;
2417         int gottime;
2418
2419         /* Skip outgoing duplicate packets. */
2420         if ((m->m_flags & M_PROMISC) != 0 && m->m_pkthdr.rcvif == NULL) {
2421                 m->m_flags &= ~M_PROMISC;
2422                 return;
2423         }
2424
2425         pktlen = m_length(m, NULL);
2426         /*
2427          * Craft on-stack mbuf suitable for passing to bpf_filter.
2428          * Note that we cut corners here; we only setup what's
2429          * absolutely needed--this mbuf should never go anywhere else.
2430          */
2431         mb.m_flags = 0;
2432         mb.m_next = m;
2433         mb.m_data = data;
2434         mb.m_len = dlen;
2435         pktlen += dlen;
2436
2437         gottime = BPF_TSTAMP_NONE;
2438
2439         NET_EPOCH_ENTER(et);
2440         CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
2441                 if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
2442                         continue;
2443                 counter_u64_add(d->bd_rcount, 1);
2444                 slen = bpf_filter(d->bd_rfilter, (u_char *)&mb, pktlen, 0);
2445                 if (slen != 0) {
2446                         BPFD_LOCK(d);
2447
2448                         counter_u64_add(d->bd_fcount, 1);
2449                         if (gottime < bpf_ts_quality(d->bd_tstamp))
2450                                 gottime = bpf_gettime(&bt, d->bd_tstamp, m);
2451 #ifdef MAC
2452                         if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
2453 #endif
2454                                 catchpacket(d, (u_char *)&mb, pktlen, slen,
2455                                     bpf_append_mbuf, &bt);
2456                         BPFD_UNLOCK(d);
2457                 }
2458         }
2459         NET_EPOCH_EXIT(et);
2460 }
2461
2462 #undef  BPF_CHECK_DIRECTION
2463 #undef  BPF_TSTAMP_NONE
2464 #undef  BPF_TSTAMP_FAST
2465 #undef  BPF_TSTAMP_NORMAL
2466 #undef  BPF_TSTAMP_EXTERN
2467
2468 static int
2469 bpf_hdrlen(struct bpf_d *d)
2470 {
2471         int hdrlen;
2472
2473         hdrlen = d->bd_bif->bif_hdrlen;
2474 #ifndef BURN_BRIDGES
2475         if (d->bd_tstamp == BPF_T_NONE ||
2476             BPF_T_FORMAT(d->bd_tstamp) == BPF_T_MICROTIME)
2477 #ifdef COMPAT_FREEBSD32
2478                 if (d->bd_compat32)
2479                         hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr32);
2480                 else
2481 #endif
2482                         hdrlen += SIZEOF_BPF_HDR(struct bpf_hdr);
2483         else
2484 #endif
2485                 hdrlen += SIZEOF_BPF_HDR(struct bpf_xhdr);
2486 #ifdef COMPAT_FREEBSD32
2487         if (d->bd_compat32)
2488                 hdrlen = BPF_WORDALIGN32(hdrlen);
2489         else
2490 #endif
2491                 hdrlen = BPF_WORDALIGN(hdrlen);
2492
2493         return (hdrlen - d->bd_bif->bif_hdrlen);
2494 }
2495
2496 static void
2497 bpf_bintime2ts(struct bintime *bt, struct bpf_ts *ts, int tstype)
2498 {
2499         struct bintime bt2, boottimebin;
2500         struct timeval tsm;
2501         struct timespec tsn;
2502
2503         if ((tstype & BPF_T_MONOTONIC) == 0) {
2504                 bt2 = *bt;
2505                 getboottimebin(&boottimebin);
2506                 bintime_add(&bt2, &boottimebin);
2507                 bt = &bt2;
2508         }
2509         switch (BPF_T_FORMAT(tstype)) {
2510         case BPF_T_MICROTIME:
2511                 bintime2timeval(bt, &tsm);
2512                 ts->bt_sec = tsm.tv_sec;
2513                 ts->bt_frac = tsm.tv_usec;
2514                 break;
2515         case BPF_T_NANOTIME:
2516                 bintime2timespec(bt, &tsn);
2517                 ts->bt_sec = tsn.tv_sec;
2518                 ts->bt_frac = tsn.tv_nsec;
2519                 break;
2520         case BPF_T_BINTIME:
2521                 ts->bt_sec = bt->sec;
2522                 ts->bt_frac = bt->frac;
2523                 break;
2524         }
2525 }
2526
2527 /*
2528  * Move the packet data from interface memory (pkt) into the
2529  * store buffer.  "cpfn" is the routine called to do the actual data
2530  * transfer.  bcopy is passed in to copy contiguous chunks, while
2531  * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
2532  * pkt is really an mbuf.
2533  */
2534 static void
2535 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
2536     void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
2537     struct bintime *bt)
2538 {
2539         struct bpf_xhdr hdr;
2540 #ifndef BURN_BRIDGES
2541         struct bpf_hdr hdr_old;
2542 #ifdef COMPAT_FREEBSD32
2543         struct bpf_hdr32 hdr32_old;
2544 #endif
2545 #endif
2546         int caplen, curlen, hdrlen, totlen;
2547         int do_wakeup = 0;
2548         int do_timestamp;
2549         int tstype;
2550
2551         BPFD_LOCK_ASSERT(d);
2552         if (d->bd_bif == NULL) {
2553                 /* Descriptor was detached in concurrent thread */
2554                 counter_u64_add(d->bd_dcount, 1);
2555                 return;
2556         }
2557
2558         /*
2559          * Detect whether user space has released a buffer back to us, and if
2560          * so, move it from being a hold buffer to a free buffer.  This may
2561          * not be the best place to do it (for example, we might only want to
2562          * run this check if we need the space), but for now it's a reliable
2563          * spot to do it.
2564          */
2565         if (d->bd_fbuf == NULL && bpf_canfreebuf(d)) {
2566                 d->bd_fbuf = d->bd_hbuf;
2567                 d->bd_hbuf = NULL;
2568                 d->bd_hlen = 0;
2569                 bpf_buf_reclaimed(d);
2570         }
2571
2572         /*
2573          * Figure out how many bytes to move.  If the packet is
2574          * greater or equal to the snapshot length, transfer that
2575          * much.  Otherwise, transfer the whole packet (unless
2576          * we hit the buffer size limit).
2577          */
2578         hdrlen = bpf_hdrlen(d);
2579         totlen = hdrlen + min(snaplen, pktlen);
2580         if (totlen > d->bd_bufsize)
2581                 totlen = d->bd_bufsize;
2582
2583         /*
2584          * Round up the end of the previous packet to the next longword.
2585          *
2586          * Drop the packet if there's no room and no hope of room
2587          * If the packet would overflow the storage buffer or the storage
2588          * buffer is considered immutable by the buffer model, try to rotate
2589          * the buffer and wakeup pending processes.
2590          */
2591 #ifdef COMPAT_FREEBSD32
2592         if (d->bd_compat32)
2593                 curlen = BPF_WORDALIGN32(d->bd_slen);
2594         else
2595 #endif
2596                 curlen = BPF_WORDALIGN(d->bd_slen);
2597         if (curlen + totlen > d->bd_bufsize || !bpf_canwritebuf(d)) {
2598                 if (d->bd_fbuf == NULL) {
2599                         /*
2600                          * There's no room in the store buffer, and no
2601                          * prospect of room, so drop the packet.  Notify the
2602                          * buffer model.
2603                          */
2604                         bpf_buffull(d);
2605                         counter_u64_add(d->bd_dcount, 1);
2606                         return;
2607                 }
2608                 KASSERT(!d->bd_hbuf_in_use, ("hold buffer is in use"));
2609                 ROTATE_BUFFERS(d);
2610                 do_wakeup = 1;
2611                 curlen = 0;
2612         } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
2613                 /*
2614                  * Immediate mode is set, or the read timeout has already
2615                  * expired during a select call.  A packet arrived, so the
2616                  * reader should be woken up.
2617                  */
2618                 do_wakeup = 1;
2619         caplen = totlen - hdrlen;
2620         tstype = d->bd_tstamp;
2621         do_timestamp = tstype != BPF_T_NONE;
2622 #ifndef BURN_BRIDGES
2623         if (tstype == BPF_T_NONE || BPF_T_FORMAT(tstype) == BPF_T_MICROTIME) {
2624                 struct bpf_ts ts;
2625                 if (do_timestamp)
2626                         bpf_bintime2ts(bt, &ts, tstype);
2627 #ifdef COMPAT_FREEBSD32
2628                 if (d->bd_compat32) {
2629                         bzero(&hdr32_old, sizeof(hdr32_old));
2630                         if (do_timestamp) {
2631                                 hdr32_old.bh_tstamp.tv_sec = ts.bt_sec;
2632                                 hdr32_old.bh_tstamp.tv_usec = ts.bt_frac;
2633                         }
2634                         hdr32_old.bh_datalen = pktlen;
2635                         hdr32_old.bh_hdrlen = hdrlen;
2636                         hdr32_old.bh_caplen = caplen;
2637                         bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr32_old,
2638                             sizeof(hdr32_old));
2639                         goto copy;
2640                 }
2641 #endif
2642                 bzero(&hdr_old, sizeof(hdr_old));
2643                 if (do_timestamp) {
2644                         hdr_old.bh_tstamp.tv_sec = ts.bt_sec;
2645                         hdr_old.bh_tstamp.tv_usec = ts.bt_frac;
2646                 }
2647                 hdr_old.bh_datalen = pktlen;
2648                 hdr_old.bh_hdrlen = hdrlen;
2649                 hdr_old.bh_caplen = caplen;
2650                 bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr_old,
2651                     sizeof(hdr_old));
2652                 goto copy;
2653         }
2654 #endif
2655
2656         /*
2657          * Append the bpf header.  Note we append the actual header size, but
2658          * move forward the length of the header plus padding.
2659          */
2660         bzero(&hdr, sizeof(hdr));
2661         if (do_timestamp)
2662                 bpf_bintime2ts(bt, &hdr.bh_tstamp, tstype);
2663         hdr.bh_datalen = pktlen;
2664         hdr.bh_hdrlen = hdrlen;
2665         hdr.bh_caplen = caplen;
2666         bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
2667
2668         /*
2669          * Copy the packet data into the store buffer and update its length.
2670          */
2671 #ifndef BURN_BRIDGES
2672 copy:
2673 #endif
2674         (*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, caplen);
2675         d->bd_slen = curlen + totlen;
2676
2677         if (do_wakeup)
2678                 bpf_wakeup(d);
2679 }
2680
2681 /*
2682  * Free buffers currently in use by a descriptor.
2683  * Called on close.
2684  */
2685 static void
2686 bpfd_free(epoch_context_t ctx)
2687 {
2688         struct bpf_d *d;
2689         struct bpf_program_buffer *p;
2690
2691         /*
2692          * We don't need to lock out interrupts since this descriptor has
2693          * been detached from its interface and it yet hasn't been marked
2694          * free.
2695          */
2696         d = __containerof(ctx, struct bpf_d, epoch_ctx);
2697         bpf_free(d);
2698         if (d->bd_rfilter != NULL) {
2699                 p = __containerof((void *)d->bd_rfilter,
2700                     struct bpf_program_buffer, buffer);
2701 #ifdef BPF_JITTER
2702                 p->func = d->bd_bfilter;
2703 #endif
2704                 bpf_program_buffer_free(&p->epoch_ctx);
2705         }
2706         if (d->bd_wfilter != NULL) {
2707                 p = __containerof((void *)d->bd_wfilter,
2708                     struct bpf_program_buffer, buffer);
2709 #ifdef BPF_JITTER
2710                 p->func = NULL;
2711 #endif
2712                 bpf_program_buffer_free(&p->epoch_ctx);
2713         }
2714
2715         mtx_destroy(&d->bd_lock);
2716         counter_u64_free(d->bd_rcount);
2717         counter_u64_free(d->bd_dcount);
2718         counter_u64_free(d->bd_fcount);
2719         counter_u64_free(d->bd_wcount);
2720         counter_u64_free(d->bd_wfcount);
2721         counter_u64_free(d->bd_wdcount);
2722         counter_u64_free(d->bd_zcopy);
2723         free(d, M_BPF);
2724 }
2725
2726 /*
2727  * Attach an interface to bpf.  dlt is the link layer type; hdrlen is the
2728  * fixed size of the link header (variable length headers not yet supported).
2729  */
2730 void
2731 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
2732 {
2733
2734         bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
2735 }
2736
2737 /*
2738  * Attach an interface to bpf.  ifp is a pointer to the structure
2739  * defining the interface to be attached, dlt is the link layer type,
2740  * and hdrlen is the fixed size of the link header (variable length
2741  * headers are not yet supporrted).
2742  */
2743 void
2744 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
2745     struct bpf_if **driverp)
2746 {
2747         struct bpf_if *bp;
2748
2749         KASSERT(*driverp == NULL,
2750             ("bpfattach2: driverp already initialized"));
2751
2752         bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
2753
2754         CK_LIST_INIT(&bp->bif_dlist);
2755         CK_LIST_INIT(&bp->bif_wlist);
2756         bp->bif_ifp = ifp;
2757         bp->bif_dlt = dlt;
2758         bp->bif_hdrlen = hdrlen;
2759         bp->bif_bpf = driverp;
2760         bp->bif_refcnt = 1;
2761         *driverp = bp;
2762         /*
2763          * Reference ifnet pointer, so it won't freed until
2764          * we release it.
2765          */
2766         if_ref(ifp);
2767         BPF_LOCK();
2768         CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
2769         BPF_UNLOCK();
2770
2771         if (bootverbose && IS_DEFAULT_VNET(curvnet))
2772                 if_printf(ifp, "bpf attached\n");
2773 }
2774
#ifdef VIMAGE
/*
 * When an interface is moved between vnet instances, the dlt and hdrlen
 * have to be queried before the detach so that if_bpf can be re-attached
 * after the vmove.  There is unfortunately no device driver
 * infrastructure to query the interface for these values after
 * creation/attach, so this accessor exists as a workaround.
 *
 * Either output pointer may be NULL, in which case that value is simply
 * not reported.  Returns ENXIO when bp is NULL and 0 otherwise.
 */
int
bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u_int *bif_hdrlen)
{

	if (bp == NULL)
		return (ENXIO);

	/* Fill in only the outputs the caller asked for. */
	if (bif_dlt != NULL)
		*bif_dlt = bp->bif_dlt;
	if (bif_hdrlen != NULL)
		*bif_hdrlen = bp->bif_hdrlen;

	return (0);
}
#endif
2800
2801 /*
2802  * Detach bpf from an interface. This involves detaching each descriptor
2803  * associated with the interface. Notify each descriptor as it's detached
2804  * so that any sleepers wake up and get ENXIO.
2805  */
2806 void
2807 bpfdetach(struct ifnet *ifp)
2808 {
2809         struct bpf_if *bp, *bp_temp;
2810         struct bpf_d *d;
2811
2812         BPF_LOCK();
2813         /* Find all bpf_if struct's which reference ifp and detach them. */
2814         CK_LIST_FOREACH_SAFE(bp, &bpf_iflist, bif_next, bp_temp) {
2815                 if (ifp != bp->bif_ifp)
2816                         continue;
2817
2818                 CK_LIST_REMOVE(bp, bif_next);
2819                 *bp->bif_bpf = (struct bpf_if *)&dead_bpf_if;
2820
2821                 CTR4(KTR_NET,
2822                     "%s: sheduling free for encap %d (%p) for if %p",
2823                     __func__, bp->bif_dlt, bp, ifp);
2824
2825                 /* Detach common descriptors */
2826                 while ((d = CK_LIST_FIRST(&bp->bif_dlist)) != NULL) {
2827                         bpf_detachd_locked(d, true);
2828                 }
2829
2830                 /* Detach writer-only descriptors */
2831                 while ((d = CK_LIST_FIRST(&bp->bif_wlist)) != NULL) {
2832                         bpf_detachd_locked(d, true);
2833                 }
2834                 bpfif_rele(bp);
2835         }
2836         BPF_UNLOCK();
2837 }
2838
2839 /*
2840  * Get a list of available data link type of the interface.
2841  */
2842 static int
2843 bpf_getdltlist(struct bpf_d *d, struct bpf_dltlist *bfl)
2844 {
2845         struct ifnet *ifp;
2846         struct bpf_if *bp;
2847         u_int *lst;
2848         int error, n, n1;
2849
2850         BPF_LOCK_ASSERT();
2851
2852         ifp = d->bd_bif->bif_ifp;
2853         n1 = 0;
2854         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2855                 if (bp->bif_ifp == ifp)
2856                         n1++;
2857         }
2858         if (bfl->bfl_list == NULL) {
2859                 bfl->bfl_len = n1;
2860                 return (0);
2861         }
2862         if (n1 > bfl->bfl_len)
2863                 return (ENOMEM);
2864
2865         lst = malloc(n1 * sizeof(u_int), M_TEMP, M_WAITOK);
2866         n = 0;
2867         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2868                 if (bp->bif_ifp != ifp)
2869                         continue;
2870                 lst[n++] = bp->bif_dlt;
2871         }
2872         error = copyout(lst, bfl->bfl_list, sizeof(u_int) * n);
2873         free(lst, M_TEMP);
2874         bfl->bfl_len = n;
2875         return (error);
2876 }
2877
2878 /*
2879  * Set the data link type of a BPF instance.
2880  */
2881 static int
2882 bpf_setdlt(struct bpf_d *d, u_int dlt)
2883 {
2884         int error, opromisc;
2885         struct ifnet *ifp;
2886         struct bpf_if *bp;
2887
2888         BPF_LOCK_ASSERT();
2889         MPASS(d->bd_bif != NULL);
2890
2891         /*
2892          * It is safe to check bd_bif without BPFD_LOCK, it can not be
2893          * changed while we hold global lock.
2894          */
2895         if (d->bd_bif->bif_dlt == dlt)
2896                 return (0);
2897
2898         ifp = d->bd_bif->bif_ifp;
2899         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2900                 if (bp->bif_ifp == ifp && bp->bif_dlt == dlt)
2901                         break;
2902         }
2903         if (bp == NULL)
2904                 return (EINVAL);
2905
2906         opromisc = d->bd_promisc;
2907         bpf_attachd(d, bp);
2908         if (opromisc) {
2909                 error = ifpromisc(bp->bif_ifp, 1);
2910                 if (error)
2911                         if_printf(bp->bif_ifp, "%s: ifpromisc failed (%d)\n",
2912                             __func__, error);
2913                 else
2914                         d->bd_promisc = 1;
2915         }
2916         return (0);
2917 }
2918
2919 static void
2920 bpf_drvinit(void *unused)
2921 {
2922         struct cdev *dev;
2923
2924         sx_init(&bpf_sx, "bpf global lock");
2925         CK_LIST_INIT(&bpf_iflist);
2926
2927         dev = make_dev(&bpf_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "bpf");
2928         /* For compatibility */
2929         make_dev_alias(dev, "bpf0");
2930 }
2931
2932 /*
2933  * Zero out the various packet counters associated with all of the bpf
2934  * descriptors.  At some point, we will probably want to get a bit more
2935  * granular and allow the user to specify descriptors to be zeroed.
2936  */
2937 static void
2938 bpf_zero_counters(void)
2939 {
2940         struct bpf_if *bp;
2941         struct bpf_d *bd;
2942
2943         BPF_LOCK();
2944         /*
2945          * We are protected by global lock here, interfaces and
2946          * descriptors can not be deleted while we hold it.
2947          */
2948         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
2949                 CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
2950                         counter_u64_zero(bd->bd_rcount);
2951                         counter_u64_zero(bd->bd_dcount);
2952                         counter_u64_zero(bd->bd_fcount);
2953                         counter_u64_zero(bd->bd_wcount);
2954                         counter_u64_zero(bd->bd_wfcount);
2955                         counter_u64_zero(bd->bd_zcopy);
2956                 }
2957         }
2958         BPF_UNLOCK();
2959 }
2960
2961 /*
2962  * Fill filter statistics
2963  */
2964 static void
2965 bpfstats_fill_xbpf(struct xbpf_d *d, struct bpf_d *bd)
2966 {
2967
2968         BPF_LOCK_ASSERT();
2969         bzero(d, sizeof(*d));
2970         d->bd_structsize = sizeof(*d);
2971         d->bd_immediate = bd->bd_immediate;
2972         d->bd_promisc = bd->bd_promisc;
2973         d->bd_hdrcmplt = bd->bd_hdrcmplt;
2974         d->bd_direction = bd->bd_direction;
2975         d->bd_feedback = bd->bd_feedback;
2976         d->bd_async = bd->bd_async;
2977         d->bd_rcount = counter_u64_fetch(bd->bd_rcount);
2978         d->bd_dcount = counter_u64_fetch(bd->bd_dcount);
2979         d->bd_fcount = counter_u64_fetch(bd->bd_fcount);
2980         d->bd_sig = bd->bd_sig;
2981         d->bd_slen = bd->bd_slen;
2982         d->bd_hlen = bd->bd_hlen;
2983         d->bd_bufsize = bd->bd_bufsize;
2984         d->bd_pid = bd->bd_pid;
2985         strlcpy(d->bd_ifname,
2986             bd->bd_bif->bif_ifp->if_xname, IFNAMSIZ);
2987         d->bd_locked = bd->bd_locked;
2988         d->bd_wcount = counter_u64_fetch(bd->bd_wcount);
2989         d->bd_wdcount = counter_u64_fetch(bd->bd_wdcount);
2990         d->bd_wfcount = counter_u64_fetch(bd->bd_wfcount);
2991         d->bd_zcopy = counter_u64_fetch(bd->bd_zcopy);
2992         d->bd_bufmode = bd->bd_bufmode;
2993 }
2994
2995 /*
2996  * Handle `netstat -B' stats request
2997  */
2998 static int
2999 bpf_stats_sysctl(SYSCTL_HANDLER_ARGS)
3000 {
3001         static const struct xbpf_d zerostats;
3002         struct xbpf_d *xbdbuf, *xbd, tempstats;
3003         int index, error;
3004         struct bpf_if *bp;
3005         struct bpf_d *bd;
3006
3007         /*
3008          * XXX This is not technically correct. It is possible for non
3009          * privileged users to open bpf devices. It would make sense
3010          * if the users who opened the devices were able to retrieve
3011          * the statistics for them, too.
3012          */
3013         error = priv_check(req->td, PRIV_NET_BPF);
3014         if (error)
3015                 return (error);
3016         /*
3017          * Check to see if the user is requesting that the counters be
3018          * zeroed out.  Explicitly check that the supplied data is zeroed,
3019          * as we aren't allowing the user to set the counters currently.
3020          */
3021         if (req->newptr != NULL) {
3022                 if (req->newlen != sizeof(tempstats))
3023                         return (EINVAL);
3024                 memset(&tempstats, 0, sizeof(tempstats));
3025                 error = SYSCTL_IN(req, &tempstats, sizeof(tempstats));
3026                 if (error)
3027                         return (error);
3028                 if (bcmp(&tempstats, &zerostats, sizeof(tempstats)) != 0)
3029                         return (EINVAL);
3030                 bpf_zero_counters();
3031                 return (0);
3032         }
3033         if (req->oldptr == NULL)
3034                 return (SYSCTL_OUT(req, 0, bpf_bpfd_cnt * sizeof(*xbd)));
3035         if (bpf_bpfd_cnt == 0)
3036                 return (SYSCTL_OUT(req, 0, 0));
3037         xbdbuf = malloc(req->oldlen, M_BPF, M_WAITOK);
3038         BPF_LOCK();
3039         if (req->oldlen < (bpf_bpfd_cnt * sizeof(*xbd))) {
3040                 BPF_UNLOCK();
3041                 free(xbdbuf, M_BPF);
3042                 return (ENOMEM);
3043         }
3044         index = 0;
3045         CK_LIST_FOREACH(bp, &bpf_iflist, bif_next) {
3046                 /* Send writers-only first */
3047                 CK_LIST_FOREACH(bd, &bp->bif_wlist, bd_next) {
3048                         xbd = &xbdbuf[index++];
3049                         bpfstats_fill_xbpf(xbd, bd);
3050                 }
3051                 CK_LIST_FOREACH(bd, &bp->bif_dlist, bd_next) {
3052                         xbd = &xbdbuf[index++];
3053                         bpfstats_fill_xbpf(xbd, bd);
3054                 }
3055         }
3056         BPF_UNLOCK();
3057         error = SYSCTL_OUT(req, xbdbuf, index * sizeof(*xbd));
3058         free(xbdbuf, M_BPF);
3059         return (error);
3060 }
3061
/* Register bpf_drvinit() as the bpf device's SYSINIT hook. */
SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, bpf_drvinit, NULL);
3063
3064 #else /* !DEV_BPF && !NETGRAPH_BPF */
3065
3066 /*
3067  * NOP stubs to allow bpf-using drivers to load and function.
3068  *
3069  * A 'better' implementation would allow the core bpf functionality
3070  * to be loaded at runtime.
3071  */
3072
3073 void
3074 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
3075 {
3076 }
3077
/*
 * No-op stub, built when neither DEV_BPF nor NETGRAPH_BPF is
 * configured: the mbuf is ignored.
 */
void
bpf_mtap(struct bpf_if *bp, struct mbuf *m)
{

	(void)bp;
	(void)m;
}
3082
3083 void
3084 bpf_mtap2(struct bpf_if *bp, void *d, u_int l, struct mbuf *m)
3085 {
3086 }
3087
3088 void
3089 bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
3090 {
3091
3092         bpfattach2(ifp, dlt, hdrlen, &ifp->if_bpf);
3093 }
3094
3095 void
3096 bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
3097 {
3098
3099         *driverp = (struct bpf_if *)&dead_bpf_if;
3100 }
3101
/*
 * No-op stub, built when neither DEV_BPF nor NETGRAPH_BPF is
 * configured: the stub attach allocated nothing, so there is nothing
 * to undo.
 */
void
bpfdetach(struct ifnet *ifp)
{

	(void)ifp;
}
3106
3107 u_int
3108 bpf_filter(const struct bpf_insn *pc, u_char *p, u_int wirelen, u_int buflen)
3109 {
3110         return -1;      /* "no filter" behaviour */
3111 }
3112
/*
 * Stub validator for kernels without bpf: always answers 0 (false),
 * whatever program is supplied.
 */
int
bpf_validate(const struct bpf_insn *f, int len)
{

	(void)f;
	(void)len;

	return (0);
}
3118
3119 #endif /* !DEV_BPF && !NETGRAPH_BPF */
3120
3121 #ifdef DDB
3122 static void
3123 bpf_show_bpf_if(struct bpf_if *bpf_if)
3124 {
3125
3126         if (bpf_if == NULL)
3127                 return;
3128         db_printf("%p:\n", bpf_if);
3129 #define BPF_DB_PRINTF(f, e)     db_printf("   %s = " f "\n", #e, bpf_if->e);
3130         /* bif_ext.bif_next */
3131         /* bif_ext.bif_dlist */
3132         BPF_DB_PRINTF("%#x", bif_dlt);
3133         BPF_DB_PRINTF("%u", bif_hdrlen);
3134         /* bif_wlist */
3135         BPF_DB_PRINTF("%p", bif_ifp);
3136         BPF_DB_PRINTF("%p", bif_bpf);
3137         BPF_DB_PRINTF("%u", bif_refcnt);
3138 }
3139
3140 DB_SHOW_COMMAND(bpf_if, db_show_bpf_if)
3141 {
3142
3143         if (!have_addr) {
3144                 db_printf("usage: show bpf_if <struct bpf_if *>\n");
3145                 return;
3146         }
3147
3148         bpf_show_bpf_if((struct bpf_if *)addr);
3149 }
3150 #endif