sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  *
  21  * $FreeBSD$
  22  */
  23
  24 /*
  25  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
  26  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  27  * Copyright (c) 2012 by Delphix. All rights reserved.
  28  */
  29
  30 /*
  31  * DTrace - Dynamic Tracing for Solaris
  32  *
  33  * This is the implementation of the Solaris Dynamic Tracing framework
  34  * (DTrace).  The user-visible interface to DTrace is described at length in
  35  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
  36  * library, the in-kernel DTrace framework, and the DTrace providers are
  37  * described in the block comments in the <sys/dtrace.h> header file.  The
  38  * internal architecture of DTrace is described in the block comments in the
  39  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
  40  * implementation very much assume mastery of all of these sources; if one has
  41  * an unanswered question about the implementation, one should consult them
  42  * first.
  43  *
  44  * The functions here are ordered roughly as follows:
  45  *
  46  *   - Probe context functions
  47  *   - Probe hashing functions
  48  *   - Non-probe context utility functions
  49  *   - Matching functions
  50  *   - Provider-to-Framework API functions
  51  *   - Probe management functions
  52  *   - DIF object functions
  53  *   - Format functions
  54  *   - Predicate functions
  55  *   - ECB functions
  56  *   - Buffer functions
  57  *   - Enabling functions
  58  *   - DOF functions
  59  *   - Anonymous enabling functions
  60  *   - Consumer state functions
  61  *   - Helper functions
  62  *   - Hook functions
  63  *   - Driver cookbook functions
  64  *
  65  * Each group of functions begins with a block comment labelled the "DTrace
  66  * [Group] Functions", allowing one to find each block by searching forward
  67  * on capital-f functions.
  68  */
  69 #include <sys/errno.h>
  70 #if !defined(sun)
  71 #include <sys/time.h>
  72 #endif
  73 #include <sys/stat.h>
  74 #include <sys/modctl.h>
  75 #include <sys/conf.h>
  76 #include <sys/systm.h>
  77 #if defined(sun)
  78 #include <sys/ddi.h>
  79 #include <sys/sunddi.h>
  80 #endif
  81 #include <sys/cpuvar.h>
  82 #include <sys/kmem.h>
  83 #if defined(sun)
  84 #include <sys/strsubr.h>
  85 #endif
  86 #include <sys/sysmacros.h>
  87 #include <sys/dtrace_impl.h>
  88 #include <sys/atomic.h>
  89 #include <sys/cmn_err.h>
  90 #if defined(sun)
  91 #include <sys/mutex_impl.h>
  92 #include <sys/rwlock_impl.h>
  93 #endif
  94 #include <sys/ctf_api.h>
  95 #if defined(sun)
  96 #include <sys/panic.h>
  97 #include <sys/priv_impl.h>
  98 #endif
  99 #include <sys/policy.h>
 100 #if defined(sun)
 101 #include <sys/cred_impl.h>
 102 #include <sys/procfs_isa.h>
 103 #endif
 104 #include <sys/taskq.h>
 105 #if defined(sun)
 106 #include <sys/mkdev.h>
 107 #include <sys/kdi.h>
 108 #endif
 109 #include <sys/zone.h>
 110 #include <sys/socket.h>
 111 #include <netinet/in.h>
 112 #include "strtolctype.h"
 113
 114 /* FreeBSD includes: */
 115 #if !defined(sun)
 116 #include <sys/callout.h>
 117 #include <sys/ctype.h>
 118 #include <sys/eventhandler.h>
 119 #include <sys/limits.h>
 120 #include <sys/kdb.h>
 121 #include <sys/kernel.h>
 122 #include <sys/malloc.h>
 123 #include <sys/sysctl.h>
 124 #include <sys/lock.h>
 125 #include <sys/mutex.h>
 126 #include <sys/rwlock.h>
 127 #include <sys/sx.h>
 128 #include <sys/dtrace_bsd.h>
 129 #include <netinet/in.h>
 130 #include "dtrace_cddl.h"
 131 #include "dtrace_debug.c"
 132 #endif
 133
 134 /*
 135  * DTrace Tunable Variables
 136  *
 137  * The following variables may be tuned by adding a line to /etc/system that
 138  * includes both the name of the DTrace module ("dtrace") and the name of the
 139  * variable.  For example:
 140  *
 141  *   set dtrace:dtrace_destructive_disallow = 1
 142  *
 143  * In general, the only variables that one should be tuning this way are those
 144  * that affect system-wide DTrace behavior, and for which the default behavior
 145  * is undesirable.  Most of these variables are tunable on a per-consumer
 146  * basis using DTrace options, and need not be tuned on a system-wide basis.
 147  * When tuning these variables, avoid pathological values; while some attempt
 148  * is made to verify the integrity of these variables, they are not considered
 149  * part of the supported interface to DTrace, and they are therefore not
 150  * checked comprehensively.  Further, these variables should not be tuned
 151  * dynamically via "mdb -kw" or other means; they should only be tuned via
 152  * /etc/system.
 153  */
  154 int             dtrace_destructive_disallow = 0;
  155 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);    /* 16M */
  156 size_t          dtrace_difo_maxsize = (256 * 1024);             /* 256K */
  157 dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);         /* 8M */
  158 size_t          dtrace_global_maxsize = (16 * 1024);            /* 16K */
  159 size_t          dtrace_actions_max = (16 * 1024);               /* 16K */
  160 size_t          dtrace_retain_max = 1024;
  161 dtrace_optval_t dtrace_helper_actions_max = 128;
  162 dtrace_optval_t dtrace_helper_providers_max = 32;
  163 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);      /* 1M */
  164 size_t          dtrace_strsize_default = 256;
      /*
       * The rate and interval values below are expressed in nanoseconds
       * (NANOSEC is one second's worth); the trailing comments give the
       * equivalent frequency or duration.
       */
  165 dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
  166 dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
  167 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
  168 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
  169 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
  170 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
  171 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
  172 dtrace_optval_t dtrace_nspec_default = 1;
  173 dtrace_optval_t dtrace_specsize_default = 32 * 1024;            /* 32K */
  174 dtrace_optval_t dtrace_stackframes_default = 20;
  175 dtrace_optval_t dtrace_ustackframes_default = 20;
  176 dtrace_optval_t dtrace_jstackframes_default = 50;
  177 dtrace_optval_t dtrace_jstackstrsize_default = 512;
  178 int             dtrace_msgdsize_max = 128;
  179 hrtime_t        dtrace_chill_max = MSEC2NSEC(500);              /* 500 ms */
  180 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
  181 int             dtrace_devdepth_max = 32;
  182 int             dtrace_err_verbose;
  183 hrtime_t        dtrace_deadman_interval = NANOSEC;              /* 1 sec */
  184 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; /* 10 sec */
  185 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;   /* 30 sec */
  186 hrtime_t        dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC; /* 60 sec */
  187 #if !defined(sun)
      /* This tunable exists only on non-Solaris builds. */
  188 int             dtrace_memstr_max = 4096;
  189 #endif
 190
 191 /*
 192  * DTrace External Variables
 193  *
 194  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
 195  * available to DTrace consumers via the backtick (`) syntax.  One of these,
 196  * dtrace_zero, is made deliberately so:  it is provided as a source of
 197  * well-known, zero-filled memory.  While this variable is not documented,
 198  * it is used by some translators as an implementation detail.
 199  */
  200 const char      dtrace_zero[256] = { 0 };       /* 256 bytes of zero-filled memory */
 201
 202 /*
 203  * DTrace Internal Variables
 204  */
  205 #if defined(sun)
  206 static dev_info_t       *dtrace_devi;           /* device info */
  207 #endif
  208 #if defined(sun)
  209 static vmem_t           *dtrace_arena;          /* probe ID arena */
  210 static vmem_t           *dtrace_minor;          /* minor number arena */
  211 #else
  212 static taskq_t          *dtrace_taskq;          /* task queue */
  213 static struct unrhdr    *dtrace_arena;          /* probe ID arena (unrhdr) */
  214 #endif
  215 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
  216 static int              dtrace_nprobes;         /* number of probes */
  217 static dtrace_provider_t *dtrace_provider;      /* provider list */
  218 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
  219 static int              dtrace_opens;           /* number of opens */
  220 static int              dtrace_helpers;         /* number of helpers */
  221 static int              dtrace_getf;            /* number of unpriv getf()s */
  222 #if defined(sun)
  223 static void             *dtrace_softstate;      /* softstate pointer */
  224 #endif
  225 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
  226 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
  227 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
  228 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
  229 static int              dtrace_toxranges;       /* number of toxic ranges */
  230 static int              dtrace_toxranges_max;   /* size of toxic range array */
  231 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
  232 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
  233 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
  234 static kthread_t        *dtrace_panicked;       /* panicking thread */
  235 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
  236 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
  237 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
  238 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
  239 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
  240 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
  241 static int              dtrace_dynvar_failclean; /* dynvars failed to clean */
  242 #if !defined(sun)
      /*
       * Mutex initialized as an MTX_DEF mutex by the MTX_SYSINIT() that follows.
       * NOTE(review): presumably serializes use of the unrhdr-based
       * dtrace_arena above -- confirm against its users.
       */
  243 static struct mtx       dtrace_unr_mtx;
  244 MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
  245 int             dtrace_in_probe;        /* non-zero if executing a probe */
  246 #if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
  247 uintptr_t       dtrace_in_probe_addr;   /* Address of invop when already in probe */
  248 #endif
      /*
       * Event handler tags.
       * NOTE(review): presumably the registrations for kld load and kld
       * unload-try events, kept so they can be deregistered -- confirm.
       */
  249 static eventhandler_tag dtrace_kld_load_tag;
  250 static eventhandler_tag dtrace_kld_unload_try_tag;
  251 #endif
 252
 253 /*
 254  * DTrace Locking
 255  * DTrace is protected by three (relatively coarse-grained) locks:
 256  *
 257  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
 258  *     including enabling state, probes, ECBs, consumer state, helper state,
 259  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
 260  *     probe context is lock-free -- synchronization is handled via the
 261  *     dtrace_sync() cross call mechanism.
 262  *
 263  * (2) dtrace_provider_lock is required when manipulating provider state, or
 264  *     when provider state must be held constant.
 265  *
 266  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
 267  *     when meta provider state must be held constant.
 268  *
 269  * The lock ordering between these three locks is dtrace_meta_lock before
 270  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
 271  * several places where dtrace_provider_lock is held by the framework as it
 272  * calls into the providers -- which then call back into the framework,
 273  * grabbing dtrace_lock.)
 274  *
 275  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
 276  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
 277  * role as a coarse-grained lock; it is acquired before both of these locks.
 278  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
 279  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
 280  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
 281  * acquired _between_ dtrace_provider_lock and dtrace_lock.
 282  */
  283 static kmutex_t         dtrace_lock;            /* probe state lock */
  284 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
  285 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
      /*
       * Acquisition order: dtrace_meta_lock, then dtrace_provider_lock, then
       * dtrace_lock.
       */
 286
  287 #if !defined(sun)
  288 /* XXX FreeBSD hacks. */
      /*
       * Solaris identifiers used by this file, mapped onto the names (or
       * constant stand-ins) that apply when not building for Solaris.
       */
  289 #define cr_suid         cr_svuid
  290 #define cr_sgid         cr_svgid
  291 #define ipaddr_t        in_addr_t
  292 #define mod_modname     pathname
  293 #define vuprintf        vprintf
  294 #define ttoproc(_a)     ((_a)->td_proc)
  295 #define crgetzoneid(_a) 0       /* argument ignored; always 0 */
  296 #define NCPU            MAXCPU
  297 #define SNOCD           0
  298 #define CPU_ON_INTR(_a) 0       /* argument ignored; always 0 */
  299
      /* Privilege identifiers, defined here as distinct single-bit flags. */
  300 #define PRIV_EFFECTIVE          (1 << 0)
  301 #define PRIV_DTRACE_KERNEL      (1 << 1)
  302 #define PRIV_DTRACE_PROC        (1 << 2)
  303 #define PRIV_DTRACE_USER        (1 << 3)
  304 #define PRIV_PROC_OWNER         (1 << 4)
  305 #define PRIV_PROC_ZONE          (1 << 5)
  306 #define PRIV_ALL                ~0      /* every bit set */
  307
      /*
       * NOTE(review): presumably declares the debug.dtrace and kern.dtrace
       * sysctl nodes, which are defined elsewhere -- confirm.
       */
  308 SYSCTL_DECL(_debug_dtrace);
  309 SYSCTL_DECL(_kern_dtrace);
  310 #endif
  311
  312 #if defined(sun)
      /* On Solaris, curcpu is spelled as the cpu_id of the CPU pointer. */
  313 #define curcpu  CPU->cpu_id
  314 #endif
 315
 316
 317 /*
 318  * DTrace Provider Variables
 319  *
 320  * These are the variables relating to DTrace as a provider (that is, the
 321  * provider of the BEGIN, END, and ERROR probes).
 322  */
  323 static dtrace_pattr_t   dtrace_provider_attr = {
      /*
       * Five { stability, stability, class } triples.
       * NOTE(review): presumably one triple each for the provider, module,
       * function, name and args attributes, in dtrace_pattr_t member order --
       * confirm against <sys/dtrace.h>.
       */
  324 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  325 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
  326 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
  327 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  328 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
  329 };
 330
 331 static void
 332 dtrace_nullop(void)
 333 {}
 334
  335 static dtrace_pops_t    dtrace_provider_ops = {
      /*
       * Every populated slot is dtrace_nullop() cast to that slot's
       * signature; three slots are left NULL.  The meaning of each slot
       * follows the member order of dtrace_pops_t, which is declared outside
       * this file.
       * NOTE(review): calling dtrace_nullop() through these incompatible
       * function-pointer types is formally undefined in ISO C; it relies on
       * the platform calling conventions tolerating ignored arguments.
       */
  336         (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
  337         (void (*)(void *, modctl_t *))dtrace_nullop,
  338         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  339         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  340         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  341         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
  342         NULL,
  343         NULL,
  344         NULL,
  345         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
  346 };
 347
  348 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
  349 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
      /* Unlike the two IDs above, the ERROR probe ID has external linkage. */
  350 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
 351
 352 /*
 353  * DTrace Helper Tracing Variables
 354  */
  355 uint32_t dtrace_helptrace_next = 0;
  356 uint32_t dtrace_helptrace_nlocals;
  357 char    *dtrace_helptrace_buffer;
  358 int     dtrace_helptrace_bufsize = 512 * 1024;  /* 512K */
  359
      /* Helper tracing defaults to enabled only when DEBUG is defined. */
  360 #ifdef DEBUG
  361 int     dtrace_helptrace_enabled = 1;
  362 #else
  363 int     dtrace_helptrace_enabled = 0;
  364 #endif
 365
  366 /*
  367  * DTrace Error Hashing
  368  *
  369  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
  370  * table.  This is very useful for checking coverage of tests that are
  371  * expected to induce DIF or DOF processing errors, and may be useful for
  372  * debugging problems in the DIF code generator or in DOF generation.  The
  373  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
       *
       * None of the variables below exist unless DEBUG is defined.
  374  */
  375 #ifdef DEBUG
  376 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];       /* the error hash table */
  377 static const char *dtrace_errlast;      /* NOTE(review): presumably the last error message seen -- confirm */
  378 static kthread_t *dtrace_errthread;     /* NOTE(review): presumably the thread that recorded it -- confirm */
  379 static kmutex_t dtrace_errlock;         /* NOTE(review): presumably protects the three variables above -- confirm */
  380 #endif
 381
 382 /*
 383  * DTrace Macros and Constants
 384  *
 385  * These are various macros that are useful in various spots in the
 386  * implementation, along with a few random constants that have no meaning
 387  * outside of the implementation.  There is no real structure to this cpp
 388  * mishmash -- but is there ever?
 389  */
  390 #define DTRACE_HASHSTR(hash, probe)     \
  391         dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
      /*
       * DTRACE_HASHSTR above: hash value (via dtrace_hash_str()) of the string
       * whose pointer is stored dth_stroffs bytes into the probe.
       */
  392
      /* Address of the probe-pointer field dth_nextoffs bytes into the probe. */
  393 #define DTRACE_HASHNEXT(hash, probe)    \
  394         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
  395
      /* Address of the probe-pointer field dth_prevoffs bytes into the probe. */
  396 #define DTRACE_HASHPREV(hash, probe)    \
  397         (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
  398
      /*
       * Non-zero when the strings found at dth_stroffs in lhs and rhs compare
       * equal under strcmp().
       */
  399 #define DTRACE_HASHEQ(hash, lhs, rhs)   \
  400         (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
  401             *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
  402
  403 #define DTRACE_AGGHASHSIZE_SLEW         17
  404
      /* Size of three uint32_t values, i.e. a byte offset of 12. */
  405 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
 406
 407 /*
 408  * The key for a thread-local variable consists of the lower 61 bits of the
 409  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
 410  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
 411  * equal to a variable identifier.  This is necessary (but not sufficient) to
 412  * assure that global associative arrays never collide with thread-local
 413  * variables.  To guarantee that they cannot collide, we must also define the
 414  * order for keying dynamic variables.  That order is:
 415  *
 416  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 417  *
 418  * Because the variable-key and the tls-key are in orthogonal spaces, there is
 419  * no way for a global variable key signature to match a thread-local key
 420  * signature.
 421  */
  422 #if defined(sun)
      /*
       * Solaris variant.  The interrupt component comes from
       * CPU->cpu_intr_actv shifted right by (LOCK_LEVEL + 1) and the thread
       * component from curthread->t_did.  "intr" counts the significant bits
       * of that value, is asserted to fit in 3 bits, and is placed in bits
       * 61-63 of the key; the low 61 bits hold (t_did + DIF_VARIABLE_MAX).
       * The macro expands to a braced block, not an expression.
       */
  423 #define DTRACE_TLS_THRKEY(where) { \
  424         uint_t intr = 0; \
  425         uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
  426         for (; actv; actv >>= 1) \
  427                 intr++; \
  428         ASSERT(intr < (1 << 3)); \
  429         (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
  430             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  431 }
  432 #else
      /*
       * Non-Solaris variant.  Same key layout, but the interrupt component
       * comes from solaris_cpu[curcpu].cpu_intr_actv (used unshifted) and the
       * thread component from curthread->td_tid.
       */
  433 #define DTRACE_TLS_THRKEY(where) { \
  434         solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
  435         uint_t intr = 0; \
  436         uint_t actv = _c->cpu_intr_actv; \
  437         for (; actv; actv >>= 1) \
  438                 intr++; \
  439         ASSERT(intr < (1 << 3)); \
  440         (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
  441             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
  442 }
  443 #endif
 444
  445 #define DT_BSWAP_8(x)   ((x) & 0xff)
  446 #define DT_BSWAP_16(x)  ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
  447 #define DT_BSWAP_32(x)  ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
  448 #define DT_BSWAP_64(x)  ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
      /*
       * The DT_BSWAP_n() macros above reverse the byte order of the low n bits
       * of x; each is built from two uses of the next smaller one.  The
       * argument is evaluated several times, so it must be free of side
       * effects, and DT_BSWAP_64() shifts by 32, so its operand must be a
       * 64-bit type.
       */
  449
  450 #define DT_MASK_LO 0x00000000FFFFFFFFULL        /* low 32 bits set */
  451
      /*
       * Store "what", converted to "type", at byte offset "offset" from
       * "tomax".  Note that the expansion already ends in a semicolon.
       */
  452 #define DTRACE_STORE(type, tomax, offset, what) \
  453         *((type *)((uintptr_t)(tomax) + (uintptr_t)offset)) = (type)(what);
  454
  455 #ifndef __x86
      /*
       * If addr has any of the bits in (size - 1) set, set CPU_DTRACE_BADALIGN
       * in *flags, record addr in the current CPU's cpuc_dtrace_illval, and
       * return 0 from the *enclosing* function.  The mask test only detects
       * misalignment when size is a power of two.  When __x86 is defined the
       * macro expands to nothing.
       */
  456 #define DTRACE_ALIGNCHECK(addr, size, flags)                            \
  457         if (addr & (size - 1)) {                                        \
  458                 *flags |= CPU_DTRACE_BADALIGN;                          \
  459                 cpu_core[curcpu].cpuc_dtrace_illval = addr;     \
  460                 return (0);                                             \
  461         }
  462 #else
  463 #define DTRACE_ALIGNCHECK(addr, size, flags)
  464 #endif
 465
  466 /*
  467  * Test whether a range of memory starting at testaddr of size testsz falls
  468  * within the range of memory described by baseaddr, basesz.  We take care to
  469  * avoid problems with overflow and underflow of the unsigned quantities, and
  470  * disallow all negative sizes.  Ranges of size 0 are allowed.
       *
       * The expression is the conjunction of three tests: the start lies inside
       * the base range, the end does not pass the end of the base range, and
       * testaddr + testsz does not wrap.  testaddr is used as an integer
       * (baseaddr is cast to uintptr_t), and each argument may be evaluated
       * more than once, so arguments should be free of side effects.
  471  */
  472 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
  473         ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
  474         (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
  475         (testaddr) + (testsz) >= (testaddr))
 476
  477 /*
  478  * Test whether alloc_sz bytes will fit in the scratch region.  We isolate
  479  * alloc_sz on the righthand side of the comparison in order to avoid overflow
  480  * or underflow in the comparison with it.  This is simpler than the INRANGE
  481  * check above, because we know that the dtms_scratch_ptr is valid in the
  482  * range.  Allocations of size zero are allowed.
       *
       * The lefthand side is the space remaining: dtms_scratch_base +
       * dtms_scratch_size - dtms_scratch_ptr.  mstate is evaluated three times.
  483  */
  484 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
  485         ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
  486         (mstate)->dtms_scratch_ptr >= (alloc_sz))
 487
  488 #define DTRACE_LOADFUNC(bits)                                           \
  489 /*CSTYLED*/                                                             \
  490 uint##bits##_t                                                          \
  491 dtrace_load##bits(uintptr_t addr)                                       \
  492 {                                                                       \
              /*                                                              \
               * Expands to the definition of dtrace_load<bits>(), which      \
               * loads a <bits>-wide unsigned value from addr.  The result is \
               * 0 if the alignment check fails, if the access overlaps a     \
               * toxic range, or if CPU_DTRACE_FAULT is set in the per-CPU    \
               * flags after the access.                                      \
               */                                                             \
  493         size_t size = bits / NBBY;                                      \
  494         /*CSTYLED*/                                                     \
  495         uint##bits##_t rval;                                            \
  496         int i;                                                          \
  497         volatile uint16_t *flags = (volatile uint16_t *)                \
  498             &cpu_core[curcpu].cpuc_dtrace_flags;                        \
  499                                                                         \
  500         DTRACE_ALIGNCHECK(addr, size, flags);                           \
  501                                                                         \
              /*                                                              \
               * Skip any toxic range that ends at or before addr, or that    \
               * begins at or after addr + size; anything else overlaps.      \
               */                                                             \
  502         for (i = 0; i < dtrace_toxranges; i++) {                        \
  503                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
  504                         continue;                                       \
  505                                                                         \
  506                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
  507                         continue;                                       \
  508                                                                         \
  509                 /*                                                      \
  510                  * This address falls within a toxic region; return 0.  \
  511                  */                                                     \
  512                 *flags |= CPU_DTRACE_BADADDR;                           \
  513                 cpu_core[curcpu].cpuc_dtrace_illval = addr;             \
  514                 return (0);                                             \
  515         }                                                               \
  516                                                                         \
              /*                                                              \
               * The raw access is bracketed by setting and then clearing     \
               * CPU_DTRACE_NOFAULT.  NOTE(review): presumably the trap       \
               * handler uses that flag to turn a fault into                  \
               * CPU_DTRACE_FAULT -- confirm.                                 \
               */                                                             \
  517         *flags |= CPU_DTRACE_NOFAULT;                                   \
  518         /*CSTYLED*/                                                     \
  519         rval = *((volatile uint##bits##_t *)addr);                      \
  520         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
  521                                                                         \
  522         return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);               \
  523 }
 524
  525 #ifdef _LP64    /* dtrace_loadptr: load of pointer width (64-bit under _LP64, else 32-bit) */
  526 #define dtrace_loadptr  dtrace_load64
  527 #else
  528 #define dtrace_loadptr  dtrace_load32
  529 #endif
  530
      /* NOTE(review): presumably the possible states of a dynamic-variable hash value -- confirm at the users. */
  531 #define DTRACE_DYNHASH_FREE     0
  532 #define DTRACE_DYNHASH_SINK     1
  533 #define DTRACE_DYNHASH_VALID    2
  534
  535 #define DTRACE_MATCH_NEXT       0
  536 #define DTRACE_MATCH_DONE       1
      /* Non-zero when the probe's dtpr_func string is not empty. */
  537 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
  538 #define DTRACE_STATE_ALIGN      64
  539
      /*
       * Map a set of CPU_DTRACE_* flag bits to one DTRACEFLT_* code.  The flags
       * are tested in the order listed, so when several are set the first one
       * listed wins; DTRACEFLT_UNKNOWN results when none of them is set.
       */
  540 #define DTRACE_FLAGS2FLT(flags)                                         \
  541         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
  542         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
  543         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
  544         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
  545         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
  546         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
  547         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
  548         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
  549         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
  550         DTRACEFLT_UNKNOWN)
  551
      /*
       * Non-zero when the action is a DIF expression (DTRACEACT_DIFEXPR) whose
       * DIF object has a return type of kind DIF_TYPE_STRING.
       */
  552 #define DTRACEACT_ISSTRING(act)                                         \
  553         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
  554         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
 555
  556 /* Function prototype definitions (forward declarations): */
  557 static size_t dtrace_strlen(const char *, size_t);
  558 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
  559 static void dtrace_enabling_provide(dtrace_provider_t *);
  560 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
  561 static void dtrace_enabling_matchall(void);
  562 static void dtrace_enabling_reap(void);
  563 static dtrace_state_t *dtrace_anon_grab(void);
  564 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
  565     dtrace_state_t *, uint64_t, uint64_t);
  566 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
  567 static void dtrace_buffer_drop(dtrace_buffer_t *);
  568 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
  569 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
  570     dtrace_state_t *, dtrace_mstate_t *);
  571 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
  572     dtrace_optval_t);
  573 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
  574 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
      /*
       * The four dtrace_load*() functions are non-static and are defined below
       * by the DTRACE_LOADFUNC() expansions.
       */
  575 uint16_t dtrace_load16(uintptr_t);
  576 uint32_t dtrace_load32(uintptr_t);
  577 uint64_t dtrace_load64(uintptr_t);
  578 uint8_t dtrace_load8(uintptr_t);
  579 void dtrace_dynvar_clean(dtrace_dstate_t *);
  580 dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
  581     size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
  582 uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
  583 static int dtrace_priv_proc(dtrace_state_t *);
  584 static void dtrace_getf_barrier(void);
 585
 586 /*
 587  * DTrace Probe Context Functions
 588  *
 589  * These functions are called from probe context.  Because probe context is
  590  * any context in which C may be called, arbitrary locks may be held,
 591  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
 592  * As a result, functions called from probe context may only call other DTrace
 593  * support functions -- they may not interact at all with the system at large.
 594  * (Note that the ASSERT macro is made probe-context safe by redefining it in
 595  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
 596  * loads are to be performed from probe context, they _must_ be in terms of
 597  * the safe dtrace_load*() variants.
 598  *
 599  * Some functions in this block are not actually called from probe context;
 600  * for these functions, there will be a comment above the function reading
 601  * "Note:  not called from probe context."
 602  */
 603 void
 604 dtrace_panic(const char *format, ...)
 605 {
 606         va_list alist;
 607
 608         va_start(alist, format);
 609 #ifdef __FreeBSD__
 610         vpanic(format, alist);
 611 #else
 612         dtrace_vpanic(format, alist);
 613 #endif
 614         va_end(alist);
 615 }
 616
 617 int
 618 dtrace_assfail(const char *a, const char *f, int l)
 619 {
 620         dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);
 621
 622         /*
 623          * We just need something here that even the most clever compiler
 624          * cannot optimize away.
 625          */
 626         return (a[(uintptr_t)f]);
 627 }
 628
 629 /*
 630  * Atomically increment a specified error counter from probe context.
 631  */
 632 static void
 633 dtrace_error(uint32_t *counter)
 634 {
 635         /*
 636          * Most counters stored to in probe context are per-CPU counters.
 637          * However, there are some error conditions that are sufficiently
 638          * arcane that they don't merit per-CPU storage.  If these counters
 639          * are incremented concurrently on different CPUs, scalability will be
 640          * adversely affected -- but we don't expect them to be white-hot in a
 641          * correctly constructed enabling...
 642          */
 643         uint32_t oval, nval;
 644
 645         do {
 646                 oval = *counter;
 647
 648                 if ((nval = oval + 1) == 0) {
 649                         /*
 650                          * If the counter would wrap, set it to 1 -- assuring
 651                          * that the counter is never zero when we have seen
 652                          * errors.  (The counter must be 32-bits because we
 653                          * aren't guaranteed a 64-bit compare&swap operation.)
 654                          * To save this code both the infamy of being fingered
 655                          * by a priggish news story and the indignity of being
 656                          * the target of a neo-puritan witch trial, we're
 657                          * carefully avoiding any colorful description of the
 658                          * likelihood of this condition -- but suffice it to
 659                          * say that it is only slightly more likely than the
 660                          * overflow of predicate cache IDs, as discussed in
 661                          * dtrace_predicate_create().
 662                          */
 663                         nval = 1;
 664                 }
 665         } while (dtrace_cas32(counter, oval, nval) != oval);
 666 }
 667
  668 /*
  669  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
  670  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
       *
       * The expansions below define dtrace_load8(), dtrace_load16(),
       * dtrace_load32() and dtrace_load64() respectively.
  671  */
  672 DTRACE_LOADFUNC(8)
  673 DTRACE_LOADFUNC(16)
  674 DTRACE_LOADFUNC(32)
  675 DTRACE_LOADFUNC(64)
 676
 677 static int
 678 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
 679 {
 680         if (dest < mstate->dtms_scratch_base)
 681                 return (0);
 682
 683         if (dest + size < dest)
 684                 return (0);
 685
 686         if (dest + size > mstate->dtms_scratch_ptr)
 687                 return (0);
 688
 689         return (1);
 690 }
 691
 692 static int
 693 dtrace_canstore_statvar(uint64_t addr, size_t sz,
 694     dtrace_statvar_t **svars, int nsvars)
 695 {
 696         int i;
 697
 698         for (i = 0; i < nsvars; i++) {
 699                 dtrace_statvar_t *svar = svars[i];
 700
 701                 if (svar == NULL || svar->dtsv_size == 0)
 702                         continue;
 703
 704                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
 705                         return (1);
 706         }
 707
 708         return (0);
 709 }
 710
 711 /*
 712  * Check to see if the address is within a memory region to which a store may
 713  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 714  * region.  The caller of dtrace_canstore() is responsible for performing any
 715  * alignment checks that are needed before stores are actually executed.
 716  */
 717 static int
 718 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 719     dtrace_vstate_t *vstate)
 720 {
 721         /*
 722          * First, check to see if the address is in scratch space...
 723          */
 724         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
 725             mstate->dtms_scratch_size))
 726                 return (1);
 727
 728         /*
 729          * Now check to see if it's a dynamic variable.  This check will pick
 730          * up both thread-local variables and any global dynamically-allocated
 731          * variables.
 732          */
 733         if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
 734             vstate->dtvs_dynvars.dtds_size)) {
 735                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
 736                 uintptr_t base = (uintptr_t)dstate->dtds_base +
 737                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
 738                 uintptr_t chunkoffs;
 739
 740                 /*
 741                  * Before we assume that we can store here, we need to make
 742                  * sure that it isn't in our metadata -- storing to our
 743                  * dynamic variable metadata would corrupt our state.  For
 744                  * the range to not include any dynamic variable metadata,
 745                  * it must:
 746                  *
 747                  *      (1) Start above the hash table that is at the base of
 748                  *      the dynamic variable space
 749                  *
 750                  *      (2) Have a starting chunk offset that is beyond the
 751                  *      dtrace_dynvar_t that is at the base of every chunk
 752                  *
 753                  *      (3) Not span a chunk boundary
 754                  *
 755                  */
 756                 if (addr < base)
 757                         return (0);
 758
 759                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
 760
 761                 if (chunkoffs < sizeof (dtrace_dynvar_t))
 762                         return (0);
 763
 764                 if (chunkoffs + sz > dstate->dtds_chunksize)
 765                         return (0);
 766
 767                 return (1);
 768         }
 769
 770         /*
 771          * Finally, check the static local and global variables.  These checks
 772          * take the longest, so we perform them last.
 773          */
 774         if (dtrace_canstore_statvar(addr, sz,
 775             vstate->dtvs_locals, vstate->dtvs_nlocals))
 776                 return (1);
 777
 778         if (dtrace_canstore_statvar(addr, sz,
 779             vstate->dtvs_globals, vstate->dtvs_nglobals))
 780                 return (1);
 781
 782         return (0);
 783 }
 784
 785
 786 /*
 787  * Convenience routine to check to see if the address is within a memory
 788  * region in which a load may be issued given the user's privilege level;
 789  * if not, it sets the appropriate error flags and loads 'addr' into the
 790  * illegal value slot.
 791  *
 792  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 793  * appropriate memory access protection.
 794  */
 795 static int
 796 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 797     dtrace_vstate_t *vstate)
 798 {
 799         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
 800         file_t *fp;
 801
 802         /*
 803          * If we hold the privilege to read from kernel memory, then
 804          * everything is readable.
 805          */
 806         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 807                 return (1);
 808
 809         /*
 810          * You can obviously read that which you can store.
 811          */
 812         if (dtrace_canstore(addr, sz, mstate, vstate))
 813                 return (1);
 814
 815         /*
 816          * We're allowed to read from our own string table.
 817          */
 818         if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
 819             mstate->dtms_difo->dtdo_strlen))
 820                 return (1);
 821
 822         if (vstate->dtvs_state != NULL &&
 823             dtrace_priv_proc(vstate->dtvs_state)) {
 824                 proc_t *p;
 825
 826                 /*
 827                  * When we have privileges to the current process, there are
 828                  * several context-related kernel structures that are safe to
 829                  * read, even absent the privilege to read from kernel memory.
 830                  * These reads are safe because these structures contain only
 831                  * state that (1) we're permitted to read, (2) is harmless or
 832                  * (3) contains pointers to additional kernel state that we're
 833                  * not permitted to read (and as such, do not present an
 834                  * opportunity for privilege escalation).  Finally (and
 835                  * critically), because of the nature of their relation with
 836                  * the current thread context, the memory associated with these
 837                  * structures cannot change over the duration of probe context,
 838                  * and it is therefore impossible for this memory to be
 839                  * deallocated and reallocated as something else while it's
 840                  * being operated upon.
 841                  */
 842                 if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
 843                         return (1);
 844
 845                 if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
 846                     sz, curthread->t_procp, sizeof (proc_t))) {
 847                         return (1);
 848                 }
 849
 850                 if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
 851                     curthread->t_cred, sizeof (cred_t))) {
 852                         return (1);
 853                 }
 854
 855 #if defined(sun)
 856                 if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
 857                     &(p->p_pidp->pid_id), sizeof (pid_t))) {
 858                         return (1);
 859                 }
 860
 861                 if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
 862                     curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
 863                         return (1);
 864                 }
 865 #endif
 866         }
 867
 868         if ((fp = mstate->dtms_getf) != NULL) {
 869                 uintptr_t psz = sizeof (void *);
 870                 vnode_t *vp;
 871                 vnodeops_t *op;
 872
 873                 /*
 874                  * When getf() returns a file_t, the enabling is implicitly
 875                  * granted the (transient) right to read the returned file_t
 876                  * as well as the v_path and v_op->vnop_name of the underlying
 877                  * vnode.  These accesses are allowed after a successful
 878                  * getf() because the members that they refer to cannot change
 879                  * once set -- and the barrier logic in the kernel's closef()
 880                  * path assures that the file_t and its referenced vode_t
 881                  * cannot themselves be stale (that is, it impossible for
 882                  * either dtms_getf itself or its f_vnode member to reference
 883                  * freed memory).
 884                  */
 885                 if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
 886                         return (1);
 887
 888                 if ((vp = fp->f_vnode) != NULL) {
 889 #if defined(sun)
 890                         if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
 891                                 return (1);
 892                         if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
 893                             vp->v_path, strlen(vp->v_path) + 1)) {
 894                                 return (1);
 895                         }
 896 #endif
 897
 898                         if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
 899                                 return (1);
 900
 901 #if defined(sun)
 902                         if ((op = vp->v_op) != NULL &&
 903                             DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
 904                                 return (1);
 905                         }
 906
 907                         if (op != NULL && op->vnop_name != NULL &&
 908                             DTRACE_INRANGE(addr, sz, op->vnop_name,
 909                             strlen(op->vnop_name) + 1)) {
 910                                 return (1);
 911                         }
 912 #endif
 913                 }
 914         }
 915
 916         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
 917         *illval = addr;
 918         return (0);
 919 }
 920
 921 /*
 922  * Convenience routine to check to see if a given string is within a memory
 923  * region in which a load may be issued given the user's privilege level;
 924  * this exists so that we don't need to issue unnecessary dtrace_strlen()
 925  * calls in the event that the user has all privileges.
 926  */
 927 static int
 928 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
 929     dtrace_vstate_t *vstate)
 930 {
 931         size_t strsz;
 932
 933         /*
 934          * If we hold the privilege to read from kernel memory, then
 935          * everything is readable.
 936          */
 937         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 938                 return (1);
 939
 940         strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
 941         if (dtrace_canload(addr, strsz, mstate, vstate))
 942                 return (1);
 943
 944         return (0);
 945 }
 946
 947 /*
 948  * Convenience routine to check to see if a given variable is within a memory
 949  * region in which a load may be issued given the user's privilege level.
 950  */
 951 static int
 952 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
 953     dtrace_vstate_t *vstate)
 954 {
 955         size_t sz;
 956         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
 957
 958         /*
 959          * If we hold the privilege to read from kernel memory, then
 960          * everything is readable.
 961          */
 962         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
 963                 return (1);
 964
 965         if (type->dtdt_kind == DIF_TYPE_STRING)
 966                 sz = dtrace_strlen(src,
 967                     vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
 968         else
 969                 sz = type->dtdt_size;
 970
 971         return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
 972 }
 973
 974 /*
 975  * Convert a string to a signed integer using safe loads.
 976  *
 977  * NOTE: This function uses various macros from strtolctype.h to manipulate
 978  * digit values, etc -- these have all been checked to ensure they make
 979  * no additional function calls.
 980  */
 981 static int64_t
 982 dtrace_strtoll(char *input, int base, size_t limit)
 983 {
 984         uintptr_t pos = (uintptr_t)input;
 985         int64_t val = 0;
 986         int x;
 987         boolean_t neg = B_FALSE;
 988         char c, cc, ccc;
 989         uintptr_t end = pos + limit;
 990
 991         /*
 992          * Consume any whitespace preceding digits.
 993          */
 994         while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
 995                 pos++;
 996
 997         /*
 998          * Handle an explicit sign if one is present.
 999          */
1000         if (c == '-' || c == '+') {
1001                 if (c == '-')
1002                         neg = B_TRUE;
1003                 c = dtrace_load8(++pos);
1004         }
1005
1006         /*
1007          * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1008          * if present.
1009          */
1010         if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1011             cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1012                 pos += 2;
1013                 c = ccc;
1014         }
1015
1016         /*
1017          * Read in contiguous digits until the first non-digit character.
1018          */
1019         for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1020             c = dtrace_load8(++pos))
1021                 val = val * base + x;
1022
1023         return (neg ? -val : val);
1024 }
1025
1026 /*
1027  * Compare two strings using safe loads.
1028  */
1029 static int
1030 dtrace_strncmp(char *s1, char *s2, size_t limit)
1031 {
1032         uint8_t c1, c2;
1033         volatile uint16_t *flags;
1034
1035         if (s1 == s2 || limit == 0)
1036                 return (0);
1037
1038         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1039
1040         do {
1041                 if (s1 == NULL) {
1042                         c1 = '\0';
1043                 } else {
1044                         c1 = dtrace_load8((uintptr_t)s1++);
1045                 }
1046
1047                 if (s2 == NULL) {
1048                         c2 = '\0';
1049                 } else {
1050                         c2 = dtrace_load8((uintptr_t)s2++);
1051                 }
1052
1053                 if (c1 != c2)
1054                         return (c1 - c2);
1055         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1056
1057         return (0);
1058 }
1059
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter specifies a maximum length to ensure completion:  the value
 * returned is the number of bytes preceding the first nul byte, or lim if no
 * nul byte is found within the first lim bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	/*
	 * The counter must be as wide as the limit; with a narrower type a
	 * limit above the counter's range could never be reached and the
	 * returned length would be truncated.
	 */
	size_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}
1076
1077 /*
1078  * Check if an address falls within a toxic region.
1079  */
1080 static int
1081 dtrace_istoxic(uintptr_t kaddr, size_t size)
1082 {
1083         uintptr_t taddr, tsize;
1084         int i;
1085
1086         for (i = 0; i < dtrace_toxranges; i++) {
1087                 taddr = dtrace_toxrange[i].dtt_base;
1088                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1089
1090                 if (kaddr - taddr < tsize) {
1091                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1092                         cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
1093                         return (1);
1094                 }
1095
1096                 if (taddr - kaddr < size) {
1097                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1098                         cpu_core[curcpu].cpuc_dtrace_illval = taddr;
1099                         return (1);
1100                 }
1101         }
1102
1103         return (0);
1104 }
1105
/*
 * Copy len bytes from src to dst, reading the source with safe memory
 * accesses.  The src is assumed to be unsafe memory specified by the DIF
 * program; the dst is assumed to be safe, DTrace-managed memory that can be
 * stored to directly.  As with standard bcopy, overlapping copies are handled
 * properly:  the copy runs forward when dst does not lie above src, and
 * backward otherwise.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	const uint8_t *from = src;
	uint8_t *to = dst;
	size_t i;

	if (to <= from) {
		for (i = 0; i < len; i++)
			to[i] = dtrace_load8((uintptr_t)&from[i]);
	} else {
		for (i = len; i > 0; i--)
			to[i - 1] = dtrace_load8((uintptr_t)&from[i - 1]);
	}
}
1133
/*
 * Copy src to dst using safe memory accesses, stopping after len bytes or
 * after a nul byte has been copied, whichever comes first.  The src is
 * assumed to be unsafe memory specified by the DIF program; the dst is
 * assumed to be safe, DTrace-managed memory that can be stored to directly.
 * Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	const uint8_t *from = src;
	uint8_t *to = dst;
	uint8_t ch;

	while (len != 0) {
		ch = dtrace_load8((uintptr_t)from++);
		*to++ = ch;
		len--;

		if (ch == '\0')
			break;
	}
}
1153
1154 /*
1155  * Copy src to dst, deriving the size and type from the specified (BYREF)
1156  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1157  * program.  The dst is assumed to be DTrace variable memory that is of the
1158  * specified type; we assume that we can store to directly.
1159  */
1160 static void
1161 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1162 {
1163         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1164
1165         if (type->dtdt_kind == DIF_TYPE_STRING) {
1166                 dtrace_strcpy(src, dst, type->dtdt_size);
1167         } else {
1168                 dtrace_bcopy(src, dst, type->dtdt_size);
1169         }
1170 }
1171
1172 /*
1173  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1174  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1175  * safe memory that we can access directly because it is managed by DTrace.
1176  */
1177 static int
1178 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1179 {
1180         volatile uint16_t *flags;
1181
1182         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1183
1184         if (s1 == s2)
1185                 return (0);
1186
1187         if (s1 == NULL || s2 == NULL)
1188                 return (1);
1189
1190         if (s1 != s2 && len != 0) {
1191                 const uint8_t *ps1 = s1;
1192                 const uint8_t *ps2 = s2;
1193
1194                 do {
1195                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1196                                 return (1);
1197                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1198         }
1199         return (0);
1200 }
1201
/*
 * Clear len bytes starting at dst, one byte at a time.  Note that this is
 * for safe DTrace-managed memory only:  the bytes are stored to directly.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uint8_t *bytes = dst;
	size_t i;

	for (i = 0; i < len; i++)
		bytes[i] = 0;
}
1214
/*
 * Add two 128-bit values.  Each value is a pair of 64-bit words with the
 * low-order word at index 0 and the high-order word at index 1.  The result
 * is computed into locals before it is stored, so sum may point at the same
 * storage as either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	uint64_t lo = addend1[0] + addend2[0];
	uint64_t hi = addend1[1] + addend2[1];

	/*
	 * If the low word came out smaller than one of its inputs, the
	 * addition wrapped and a carry is owed to the high word.
	 */
	if (lo < addend1[0] || lo < addend2[0])
		hi++;

	sum[0] = lo;
	sum[1] = hi;
}
1227
/*
 * Shift the 128-bit value in a by b bits, in place.  The value is a pair of
 * 64-bit words with the low-order word in a[0] and the high-order word in
 * a[1].  If b is positive, shift left; if b is negative, shift right (a
 * logical shift -- zeroes are shifted in).  A shift of 128 bits or more in
 * either direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	if (b == 0)
		return;

	/*
	 * Everything is shifted out.  Handling this up front also keeps the
	 * 64-bit shift counts below in range and avoids negating INT_MIN.
	 */
	if (b <= -128 || b >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		b = -b;

		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			/*
			 * The low b bits of the high word become the top b
			 * bits of the low word; the left shift by (64 - b)
			 * discards the rest, so no mask is required.
			 */
			a[0] = (a[0] >> b) | (a[1] << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			/*
			 * The top b bits of the low word become the low b
			 * bits of the high word.
			 */
			a[1] = (a[1] << b) | (a[0] >> (64 - b));
			a[0] <<= b;
		}
	}
}
1264
1265 /*
1266  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1267  * use native multiplication on those, and then re-combine into the
1268  * resulting 128-bit value.
1269  *
1270  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1271  *     hi1 * hi2 << 64 +
1272  *     hi1 * lo2 << 32 +
1273  *     hi2 * lo1 << 32 +
1274  *     lo1 * lo2
1275  */
1276 static void
1277 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1278 {
1279         uint64_t hi1, hi2, lo1, lo2;
1280         uint64_t tmp[2];
1281
1282         hi1 = factor1 >> 32;
1283         hi2 = factor2 >> 32;
1284
1285         lo1 = factor1 & DT_MASK_LO;
1286         lo2 = factor2 & DT_MASK_LO;
1287
1288         product[0] = lo1 * lo2;
1289         product[1] = hi1 * hi2;
1290
1291         tmp[0] = hi1 * lo2;
1292         tmp[1] = 0;
1293         dtrace_shift_128(tmp, 32);
1294         dtrace_add_128(product, tmp, product);
1295
1296         tmp[0] = hi2 * lo1;
1297         tmp[1] = 0;
1298         dtrace_shift_128(tmp, 32);
1299         dtrace_add_128(product, tmp, product);
1300 }
1301
1302 /*
1303  * This privilege check should be used by actions and subroutines to
1304  * verify that the user credentials of the process that enabled the
1305  * invoking ECB match the target credentials
1306  */
1307 static int
1308 dtrace_priv_proc_common_user(dtrace_state_t *state)
1309 {
1310         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1311
1312         /*
1313          * We should always have a non-NULL state cred here, since if cred
1314          * is null (anonymous tracing), we fast-path bypass this routine.
1315          */
1316         ASSERT(s_cr != NULL);
1317
1318         if ((cr = CRED()) != NULL &&
1319             s_cr->cr_uid == cr->cr_uid &&
1320             s_cr->cr_uid == cr->cr_ruid &&
1321             s_cr->cr_uid == cr->cr_suid &&
1322             s_cr->cr_gid == cr->cr_gid &&
1323             s_cr->cr_gid == cr->cr_rgid &&
1324             s_cr->cr_gid == cr->cr_sgid)
1325                 return (1);
1326
1327         return (0);
1328 }
1329
1330 /*
1331  * This privilege check should be used by actions and subroutines to
1332  * verify that the zone of the process that enabled the invoking ECB
1333  * matches the target credentials
1334  */
1335 static int
1336 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1337 {
1338 #if defined(sun)
1339         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1340
1341         /*
1342          * We should always have a non-NULL state cred here, since if cred
1343          * is null (anonymous tracing), we fast-path bypass this routine.
1344          */
1345         ASSERT(s_cr != NULL);
1346
1347         if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1348                 return (1);
1349
1350         return (0);
1351 #else
1352         return (1);
1353 #endif
1354 }
1355
1356 /*
1357  * This privilege check should be used by actions and subroutines to
1358  * verify that the process has not setuid or changed credentials.
1359  */
1360 static int
1361 dtrace_priv_proc_common_nocd(void)
1362 {
1363         proc_t *proc;
1364
1365         if ((proc = ttoproc(curthread)) != NULL &&
1366             !(proc->p_flag & SNOCD))
1367                 return (1);
1368
1369         return (0);
1370 }
1371
1372 static int
1373 dtrace_priv_proc_destructive(dtrace_state_t *state)
1374 {
1375         int action = state->dts_cred.dcr_action;
1376
1377         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1378             dtrace_priv_proc_common_zone(state) == 0)
1379                 goto bad;
1380
1381         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1382             dtrace_priv_proc_common_user(state) == 0)
1383                 goto bad;
1384
1385         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1386             dtrace_priv_proc_common_nocd() == 0)
1387                 goto bad;
1388
1389         return (1);
1390
1391 bad:
1392         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1393
1394         return (0);
1395 }
1396
1397 static int
1398 dtrace_priv_proc_control(dtrace_state_t *state)
1399 {
1400         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1401                 return (1);
1402
1403         if (dtrace_priv_proc_common_zone(state) &&
1404             dtrace_priv_proc_common_user(state) &&
1405             dtrace_priv_proc_common_nocd())
1406                 return (1);
1407
1408         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1409
1410         return (0);
1411 }
1412
1413 static int
1414 dtrace_priv_proc(dtrace_state_t *state)
1415 {
1416         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1417                 return (1);
1418
1419         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1420
1421         return (0);
1422 }
1423
1424 static int
1425 dtrace_priv_kernel(dtrace_state_t *state)
1426 {
1427         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1428                 return (1);
1429
1430         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1431
1432         return (0);
1433 }
1434
1435 static int
1436 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1437 {
1438         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1439                 return (1);
1440
1441         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1442
1443         return (0);
1444 }
1445
1446 /*
1447  * Determine if the dte_cond of the specified ECB allows for processing of
1448  * the current probe to continue.  Note that this routine may allow continued
1449  * processing, but with access(es) stripped from the mstate's dtms_access
1450  * field.
1451  */
1452 static int
1453 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1454     dtrace_ecb_t *ecb)
1455 {
1456         dtrace_probe_t *probe = ecb->dte_probe;
1457         dtrace_provider_t *prov = probe->dtpr_provider;
1458         dtrace_pops_t *pops = &prov->dtpv_pops;
1459         int mode = DTRACE_MODE_NOPRIV_DROP;
1460
1461         ASSERT(ecb->dte_cond);
1462
1463 #if defined(sun)
1464         if (pops->dtps_mode != NULL) {
1465                 mode = pops->dtps_mode(prov->dtpv_arg,
1466                     probe->dtpr_id, probe->dtpr_arg);
1467
1468                 ASSERT((mode & DTRACE_MODE_USER) ||
1469                     (mode & DTRACE_MODE_KERNEL));
1470                 ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1471                     (mode & DTRACE_MODE_NOPRIV_DROP));
1472         }
1473
1474         /*
1475          * If the dte_cond bits indicate that this consumer is only allowed to
1476          * see user-mode firings of this probe, call the provider's dtps_mode()
1477          * entry point to check that the probe was fired while in a user
1478          * context.  If that's not the case, use the policy specified by the
1479          * provider to determine if we drop the probe or merely restrict
1480          * operation.
1481          */
1482         if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1483                 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1484
1485                 if (!(mode & DTRACE_MODE_USER)) {
1486                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1487                                 return (0);
1488
1489                         mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1490                 }
1491         }
1492 #endif
1493
1494         /*
1495          * This is more subtle than it looks. We have to be absolutely certain
1496          * that CRED() isn't going to change out from under us so it's only
1497          * legit to examine that structure if we're in constrained situations.
1498          * Currently, the only times we'll this check is if a non-super-user
1499          * has enabled the profile or syscall providers -- providers that
1500          * allow visibility of all processes. For the profile case, the check
1501          * above will ensure that we're examining a user context.
1502          */
1503         if (ecb->dte_cond & DTRACE_COND_OWNER) {
1504                 cred_t *cr;
1505                 cred_t *s_cr = state->dts_cred.dcr_cred;
1506                 proc_t *proc;
1507
1508                 ASSERT(s_cr != NULL);
1509
1510                 if ((cr = CRED()) == NULL ||
1511                     s_cr->cr_uid != cr->cr_uid ||
1512                     s_cr->cr_uid != cr->cr_ruid ||
1513                     s_cr->cr_uid != cr->cr_suid ||
1514                     s_cr->cr_gid != cr->cr_gid ||
1515                     s_cr->cr_gid != cr->cr_rgid ||
1516                     s_cr->cr_gid != cr->cr_sgid ||
1517                     (proc = ttoproc(curthread)) == NULL ||
1518                     (proc->p_flag & SNOCD)) {
1519                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1520                                 return (0);
1521
1522 #if defined(sun)
1523                         mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1524 #endif
1525                 }
1526         }
1527
1528 #if defined(sun)
1529         /*
1530          * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1531          * in our zone, check to see if our mode policy is to restrict rather
1532          * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1533          * and DTRACE_ACCESS_ARGS
1534          */
1535         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1536                 cred_t *cr;
1537                 cred_t *s_cr = state->dts_cred.dcr_cred;
1538
1539                 ASSERT(s_cr != NULL);
1540
1541                 if ((cr = CRED()) == NULL ||
1542                     s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1543                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1544                                 return (0);
1545
1546                         mstate->dtms_access &=
1547                             ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1548                 }
1549         }
1550 #endif
1551
1552         return (1);
1553 }
1554
1555 /*
1556  * Note:  not called from probe context.  This function is called
1557  * asynchronously (and at a regular interval) from outside of probe context to
1558  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1559  * cleaning is explained in detail in <sys/dtrace_impl.h>.
      *
      * The work proceeds in phases:  (1) every CPU with a non-NULL dirty list
      * has that list moved atomically onto a rinsing list -- its own, or that
      * of a CPU whose clean, dirty and rinsing lists are all NULL when its own
      * clean list is still non-NULL; (2) after a dtrace_sync(), each non-NULL
      * rinsing list becomes that CPU's clean list; (3) after a second
      * dtrace_sync(), dtds_state is set to DTRACE_DSTATE_CLEAN.  If no dirty
      * list could be moved, the function returns after the first phase
      * without touching dtds_state.
      *
      * dstate is the dynamic variable state whose per-CPU lists are cleaned.
1560  */
1561 void
1562 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1563 {
1564         dtrace_dynvar_t *dirty;
1565         dtrace_dstate_percpu_t *dcpu;
1566         dtrace_dynvar_t **rinsep;
1567         int i, j, work = 0;
1568
1569         for (i = 0; i < NCPU; i++) {
1570                 dcpu = &dstate->dtds_percpu[i];
                     /* By default, a CPU's dirty list rinses on that same CPU. */
1571                 rinsep = &dcpu->dtdsc_rinsing;
1572
1573                 /*
1574                  * If the dirty list is NULL, there is no dirty work to do.
1575                  */
1576                 if (dcpu->dtdsc_dirty == NULL)
1577                         continue;
1578
1579                 if (dcpu->dtdsc_rinsing != NULL) {
1580                         /*
1581                          * If the rinsing list is non-NULL, then it is because
1582                          * this CPU was selected to accept another CPU's
1583                          * dirty list -- and since that time, dirty buffers
1584                          * have accumulated.  This is a highly unlikely
1585                          * condition, but we choose to ignore the dirty
1586                          * buffers -- they'll be picked up by a future cleanse.
1587                          */
1588                         continue;
1589                 }
1590
1591                 if (dcpu->dtdsc_clean != NULL) {
1592                         /*
1593                          * If the clean list is non-NULL, then we're in a
1594                          * situation where a CPU has done deallocations (we
1595                          * have a non-NULL dirty list) but no allocations (we
1596                          * also have a non-NULL clean list).  We can't simply
1597                          * move the dirty list into the clean list on this
1598                          * CPU, yet we also don't want to allow this condition
1599                          * to persist, lest a short clean list prevent a
1600                          * massive dirty list from being cleaned (which in
1601                          * turn could lead to otherwise avoidable dynamic
1602                          * drops).  To deal with this, we look for some CPU
1603                          * with a NULL clean list, NULL dirty list, and NULL
1604                          * rinsing list -- and then we borrow this CPU to
1605                          * rinse our dirty list.
1606                          */
1607                         for (j = 0; j < NCPU; j++) {
1608                                 dtrace_dstate_percpu_t *rinser;
1609
1610                                 rinser = &dstate->dtds_percpu[j];
1611
1612                                 if (rinser->dtdsc_rinsing != NULL)
1613                                         continue;
1614
1615                                 if (rinser->dtdsc_dirty != NULL)
1616                                         continue;
1617
1618                                 if (rinser->dtdsc_clean != NULL)
1619                                         continue;
1620
1621                                 rinsep = &rinser->dtdsc_rinsing;
1622                                 break;
1623                         }
1624
1625                         if (j == NCPU) {
1626                                 /*
1627                                  * We were unable to find another CPU that
1628                                  * could accept this dirty list -- we are
1629                                  * therefore unable to clean it now.
1630                                  */
1631                                 dtrace_dynvar_failclean++;
1632                                 continue;
1633                         }
1634                 }
1635
                     /* At least one dirty list is being moved; the later phases must run. */
1636                 work = 1;
1637
1638                 /*
1639                  * Atomically move the dirty list aside.
1640                  */
1641                 do {
1642                         dirty = dcpu->dtdsc_dirty;
1643
1644                         /*
1645                          * Before we zap the dirty list, set the rinsing list.
1646                          * (This allows for a potential assertion in
1647                          * dtrace_dynvar():  if a free dynamic variable appears
1648                          * on a hash chain, either the dirty list or the
1649                          * rinsing list for some CPU must be non-NULL.)
1650                          */
1651                         *rinsep = dirty;
1652                         dtrace_membar_producer();
1653                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1654                     dirty, NULL) != dirty);
1655         }
1656
1657         if (!work) {
1658                 /*
1659                  * We have no work to do; we can simply return.
1660                  */
1661                 return;
1662         }
1663
             /*
              * Synchronize before promoting the rinsing lists; the guarantee
              * stated in the loop below depends on this call.
              */
1664         dtrace_sync();
1665
1666         for (i = 0; i < NCPU; i++) {
1667                 dcpu = &dstate->dtds_percpu[i];
1668
1669                 if (dcpu->dtdsc_rinsing == NULL)
1670                         continue;
1671
1672                 /*
1673                  * We are now guaranteed that no hash chain contains a pointer
1674                  * into this dirty list; we can make it clean.
1675                  */
1676                 ASSERT(dcpu->dtdsc_clean == NULL);
1677                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1678                 dcpu->dtdsc_rinsing = NULL;
1679         }
1680
1681         /*
1682          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1683          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1684          * This prevents a race whereby a CPU incorrectly decides that
1685          * the state should be something other than DTRACE_DSTATE_CLEAN
1686          * after dtrace_dynvar_clean() has completed.
1687          */
1688         dtrace_sync();
1689
1690         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1691 }
1692
1693 /*
1694  * Depending on the value of the op parameter, this function looks-up,
1695  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1696  * allocation is requested, this function will return a pointer to a
1697  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1698  * variable can be allocated.  If NULL is returned, the appropriate counter
1699  * will be incremented.
      *
      * Arguments:
      *
      *   dstate   the dynamic variable state; supplies the hash table, its
      *            size, the chunk size and the per-CPU free/clean/dirty lists
      *   nkeys    number of elements in key[]; must be non-zero
      *   key      the tuple identifying the variable.  An element whose
      *            dttk_size is zero is matched on dttk_value itself; otherwise
      *            dttk_value is the address of dttk_size bytes that are
      *            matched by content
      *   dsize    size of the variable's data; used here only to check that
      *            the variable fits within one chunk
      *   op       DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
      *            DTRACE_DYNVAR_DEALLOC
      *   mstate,
      *   vstate   handed to dtrace_canload() to check by-reference key data
      *
      * For ALLOC and NOALLOC an existing variable with a matching tuple is
      * returned if one is found; ALLOC otherwise returns a newly allocated
      * variable.  NULL is returned in every other case -- and always for
      * DEALLOC, whether or not a matching variable was found and freed.
1700  */
1701 dtrace_dynvar_t *
1702 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1703     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1704     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1705 {
1706         uint64_t hashval = DTRACE_DYNHASH_VALID;
1707         dtrace_dynhash_t *hash = dstate->dtds_hash;
1708         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1709         processorid_t me = curcpu, cpu = me;
1710         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1711         size_t bucket, ksize;
1712         size_t chunksize = dstate->dtds_chunksize;
1713         uintptr_t kdata, lock, nstate;
1714         uint_t i;
1715
1716         ASSERT(nkeys != 0);
1717
1718         /*
1719          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1720          * algorithm.  For the by-value portions, we perform the algorithm in
1721          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1722          * bit, and seems to have only a minute effect on distribution.  For
1723          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1724          * over each referenced byte.  It's painful to do this, but it's much
1725          * better than pathological hash distribution.  The efficacy of the
1726          * hashing algorithm (and a comparison with other algorithms) may be
1727          * found by running the ::dtrace_dynstat MDB dcmd.
1728          */
1729         for (i = 0; i < nkeys; i++) {
1730                 if (key[i].dttk_size == 0) {
1731                         uint64_t val = key[i].dttk_value;
1732
1733                         hashval += (val >> 48) & 0xffff;
1734                         hashval += (hashval << 10);
1735                         hashval ^= (hashval >> 6);
1736
1737                         hashval += (val >> 32) & 0xffff;
1738                         hashval += (hashval << 10);
1739                         hashval ^= (hashval >> 6);
1740
1741                         hashval += (val >> 16) & 0xffff;
1742                         hashval += (hashval << 10);
1743                         hashval ^= (hashval >> 6);
1744
1745                         hashval += val & 0xffff;
1746                         hashval += (hashval << 10);
1747                         hashval ^= (hashval >> 6);
1748                 } else {
1749                         /*
1750                          * This is incredibly painful, but it beats the hell
1751                          * out of the alternative.
1752                          */
1753                         uint64_t j, size = key[i].dttk_size;
1754                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1755
                             /*
                              * NOTE(review): a failed access check only stops
                              * the hashing here; presumably dtrace_canload()
                              * has flagged a fault so that the
                              * CPU_DTRACE_FAULT check after this loop returns
                              * NULL -- confirm.
                              */
1756                         if (!dtrace_canload(base, size, mstate, vstate))
1757                                 break;
1758
1759                         for (j = 0; j < size; j++) {
1760                                 hashval += dtrace_load8(base + j);
1761                                 hashval += (hashval << 10);
1762                                 hashval ^= (hashval >> 6);
1763                         }
1764                 }
1765         }
1766
1767         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1768                 return (NULL);
1769
1770         hashval += (hashval << 3);
1771         hashval ^= (hashval >> 11);
1772         hashval += (hashval << 15);
1773
1774         /*
1775          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1776          * comes out to be one of our two sentinel hash values.  If this
1777          * actually happens, we set the hashval to be a value known to be a
1778          * non-sentinel value.
1779          */
1780         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1781                 hashval = DTRACE_DYNHASH_VALID;
1782
1783         /*
1784          * Yes, it's painful to do a divide here.  If the cycle count becomes
1785          * important here, tricks can be pulled to reduce it.  (However, it's
1786          * critical that hash collisions be kept to an absolute minimum;
1787          * they're much more painful than a divide.)  It's better to have a
1788          * solution that generates few collisions and still keeps things
1789          * relatively simple.
1790          */
1791         bucket = hashval % dstate->dtds_hashsize;
1792
             /*
              * Deallocations take the bucket's lock word:  spin while its low
              * bit is set, then atomically bump it from even to odd.  The word
              * is bumped again (back to even) on both DEALLOC return paths
              * below.
              */
1793         if (op == DTRACE_DYNVAR_DEALLOC) {
1794                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1795
1796                 for (;;) {
1797                         while ((lock = *lockp) & 1)
1798                                 continue;
1799
1800                         if (dtrace_casptr((volatile void *)lockp,
1801                             (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1802                                 break;
1803                 }
1804
1805                 dtrace_membar_producer();
1806         }
1807
1808 top:
             /*
              * Snapshot the lock word before reading the chain; a NOALLOC
              * lookup that finds nothing compares against this snapshot below
              * and retries if the word has changed in the meantime.
              */
1809         prev = NULL;
1810         lock = hash[bucket].dtdh_lock;
1811
1812         dtrace_membar_consumer();
1813
1814         start = hash[bucket].dtdh_chain;
1815         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1816             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1817             op != DTRACE_DYNVAR_DEALLOC));
1818
1819         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1820                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1821                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1822
1823                 if (dvar->dtdv_hashval != hashval) {
1824                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1825                                 /*
1826                                  * We've reached the sink, and therefore the
1827                                  * end of the hash chain; we can kick out of
1828                                  * the loop knowing that we have seen a valid
1829                                  * snapshot of state.
1830                                  */
1831                                 ASSERT(dvar->dtdv_next == NULL);
1832                                 ASSERT(dvar == &dtrace_dynhash_sink);
1833                                 break;
1834                         }
1835
1836                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1837                                 /*
1838                                  * We've gone off the rails:  somewhere along
1839                                  * the line, one of the members of this hash
1840                                  * chain was deleted.  Note that we could also
1841                                  * detect this by simply letting this loop run
1842                                  * to completion, as we would eventually hit
1843                                  * the end of the dirty list.  However, we
1844                                  * want to avoid running the length of the
1845                                  * dirty list unnecessarily (it might be quite
1846                                  * long), so we catch this as early as
1847                                  * possible by detecting the hash marker.  In
1848                                  * this case, we simply set dvar to NULL and
1849                                  * break; the conditional after the loop will
1850                                  * send us back to top.
1851                                  */
1852                                 dvar = NULL;
1853                                 break;
1854                         }
1855
1856                         goto next;
1857                 }
1858
1859                 if (dtuple->dtt_nkeys != nkeys)
1860                         goto next;
1861
1862                 for (i = 0; i < nkeys; i++, dkey++) {
1863                         if (dkey->dttk_size != key[i].dttk_size)
1864                                 goto next; /* size or type mismatch */
1865
1866                         if (dkey->dttk_size != 0) {
1867                                 if (dtrace_bcmp(
1868                                     (void *)(uintptr_t)key[i].dttk_value,
1869                                     (void *)(uintptr_t)dkey->dttk_value,
1870                                     dkey->dttk_size))
1871                                         goto next;
1872                         } else {
1873                                 if (dkey->dttk_value != key[i].dttk_value)
1874                                         goto next;
1875                         }
1876                 }
1877
                     /* The tuple matches:  a lookup or allocation is satisfied here. */
1878                 if (op != DTRACE_DYNVAR_DEALLOC)
1879                         return (dvar);
1880
1881                 ASSERT(dvar->dtdv_next == NULL ||
1882                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1883
                     /*
                      * Unlink dvar from the chain:  a plain store when it has a
                      * predecessor, a compare-and-swap when it is the chain head.
                      */
1884                 if (prev != NULL) {
1885                         ASSERT(hash[bucket].dtdh_chain != dvar);
1886                         ASSERT(start != dvar);
1887                         ASSERT(prev->dtdv_next == dvar);
1888                         prev->dtdv_next = dvar->dtdv_next;
1889                 } else {
1890                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
1891                             start, dvar->dtdv_next) != start) {
1892                                 /*
1893                                  * We have failed to atomically swing the
1894                                  * hash table head pointer, presumably because
1895                                  * of a conflicting allocation on another CPU.
1896                                  * We need to reread the hash chain and try
1897                                  * again.
1898                                  */
1899                                 goto top;
1900                         }
1901                 }
1902
1903                 dtrace_membar_producer();
1904
1905                 /*
1906                  * Now set the hash value to indicate that it's free.
1907                  */
1908                 ASSERT(hash[bucket].dtdh_chain != dvar);
1909                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1910
1911                 dtrace_membar_producer();
1912
1913                 /*
1914                  * Set the next pointer to point at the dirty list, and
1915                  * atomically swing the dirty pointer to the newly freed dvar.
1916                  */
1917                 do {
1918                         next = dcpu->dtdsc_dirty;
1919                         dvar->dtdv_next = next;
1920                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1921
1922                 /*
1923                  * Finally, unlock this hash bucket.
1924                  */
1925                 ASSERT(hash[bucket].dtdh_lock == lock);
1926                 ASSERT(lock & 1);
1927                 hash[bucket].dtdh_lock++;
1928
1929                 return (NULL);
1930 next:
1931                 prev = dvar;
1932                 continue;
1933         }
1934
1935         if (dvar == NULL) {
1936                 /*
1937                  * If dvar is NULL, it is because we went off the rails:
1938                  * one of the elements that we traversed in the hash chain
1939                  * was deleted while we were traversing it.  In this case,
1940                  * we assert that we aren't doing a dealloc (deallocs lock
1941                  * the hash bucket to prevent themselves from racing with
1942                  * one another), and retry the hash chain traversal.
1943                  */
1944                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1945                 goto top;
1946         }
1947
1948         if (op != DTRACE_DYNVAR_ALLOC) {
1949                 /*
1950                  * If we are not to allocate a new variable, we want to
1951                  * return NULL now.  Before we return, check that the value
1952                  * of the lock word hasn't changed.  If it has, we may have
1953                  * seen an inconsistent snapshot.
1954                  */
1955                 if (op == DTRACE_DYNVAR_NOALLOC) {
1956                         if (hash[bucket].dtdh_lock != lock)
1957                                 goto top;
1958                 } else {
1959                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1960                         ASSERT(hash[bucket].dtdh_lock == lock);
1961                         ASSERT(lock & 1);
1962                         hash[bucket].dtdh_lock++;
1963                 }
1964
1965                 return (NULL);
1966         }
1967
1968         /*
1969          * We need to allocate a new dynamic variable.  The size we need is the
1970          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1971          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1972          * the size of any referred-to data (dsize).  We then round the final
1973          * size up to the chunksize for allocation.
1974          */
1975         for (ksize = 0, i = 0; i < nkeys; i++)
1976                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1977
1978         /*
1979          * This should be pretty much impossible, but could happen if, say,
1980          * strange DIF specified the tuple.  Ideally, this should be an
1981          * assertion and not an error condition -- but that requires that the
1982          * chunksize calculation in dtrace_difo_chunksize() be absolutely
1983          * bullet-proof.  (That is, it must not be able to be fooled by
1984          * malicious DIF.)  Given the lack of backwards branches in DIF,
1985          * solving this would presumably not amount to solving the Halting
1986          * Problem -- but it still seems awfully hard.
1987          */
1988         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1989             ksize + dsize > chunksize) {
1990                 dcpu->dtdsc_drops++;
1991                 return (NULL);
1992         }
1993
             /*
              * nstate accumulates the state to publish if a full lap over all
              * CPUs finds neither a free nor a clean chunk:  it stays EMPTY
              * unless a visited CPU had a dirty list (DIRTY) or a rinsing list
              * (RINSING).
              */
1994         nstate = DTRACE_DSTATE_EMPTY;
1995
1996         do {
1997 retry:
1998                 free = dcpu->dtdsc_free;
1999
2000                 if (free == NULL) {
2001                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2002                         void *rval;
2003
2004                         if (clean == NULL) {
2005                                 /*
2006                                  * We're out of dynamic variable space on
2007                                  * this CPU.  Unless we have tried all CPUs,
2008                                  * we'll try to allocate from a different
2009                                  * CPU.
2010                                  */
2011                                 switch (dstate->dtds_state) {
2012                                 case DTRACE_DSTATE_CLEAN: {
2013                                         void *sp = &dstate->dtds_state;
2014
2015                                         if (++cpu >= NCPU)
2016                                                 cpu = 0;
2017
2018                                         if (dcpu->dtdsc_dirty != NULL &&
2019                                             nstate == DTRACE_DSTATE_EMPTY)
2020                                                 nstate = DTRACE_DSTATE_DIRTY;
2021
2022                                         if (dcpu->dtdsc_rinsing != NULL)
2023                                                 nstate = DTRACE_DSTATE_RINSING;
2024
2025                                         dcpu = &dstate->dtds_percpu[cpu];
2026
2027                                         if (cpu != me)
2028                                                 goto retry;
2029
2030                                         (void) dtrace_cas32(sp,
2031                                             DTRACE_DSTATE_CLEAN, nstate);
2032
2033                                         /*
2034                                          * To increment the correct bean
2035                                          * counter, take another lap.
2036                                          */
2037                                         goto retry;
2038                                 }
2039
2040                                 case DTRACE_DSTATE_DIRTY:
2041                                         dcpu->dtdsc_dirty_drops++;
2042                                         break;
2043
2044                                 case DTRACE_DSTATE_RINSING:
2045                                         dcpu->dtdsc_rinsing_drops++;
2046                                         break;
2047
2048                                 case DTRACE_DSTATE_EMPTY:
2049                                         dcpu->dtdsc_drops++;
2050                                         break;
2051                                 }
2052
2053                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2054                                 return (NULL);
2055                         }
2056
2057                         /*
2058                          * The clean list appears to be non-empty.  We want to
2059                          * move the clean list to the free list; we start by
2060                          * moving the clean pointer aside.
2061                          */
2062                         if (dtrace_casptr(&dcpu->dtdsc_clean,
2063                             clean, NULL) != clean) {
2064                                 /*
2065                                  * We are in one of two situations:
2066                                  *
2067                                  *  (a) The clean list was switched to the
2068                                  *      free list by another CPU.
2069                                  *
2070                                  *  (b) The clean list was added to by the
2071                                  *      cleansing cyclic.
2072                                  *
2073                                  * In either of these situations, we can
2074                                  * just reattempt the free list allocation.
2075                                  */
2076                                 goto retry;
2077                         }
2078
2079                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2080
2081                         /*
2082                          * Now we'll move the clean list to our free list.
2083                          * It's impossible for this to fail:  the only way
2084                          * the free list can be updated is through this
2085                          * code path, and only one CPU can own the clean list.
2086                          * Thus, it would only be possible for this to fail if
2087                          * this code were racing with dtrace_dynvar_clean().
2088                          * (That is, if dtrace_dynvar_clean() updated the clean
2089                          * list, and we ended up racing to update the free
2090                          * list.)  This race is prevented by the dtrace_sync()
2091                          * in dtrace_dynvar_clean() -- which flushes the
2092                          * owners of the clean lists out before resetting
2093                          * the clean lists.
2094                          */
2095                         dcpu = &dstate->dtds_percpu[me];
2096                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2097                         ASSERT(rval == NULL);
2098                         goto retry;
2099                 }
2100
                     /* Pop the head of the free list; retry if another CPU raced us. */
2101                 dvar = free;
2102                 new_free = dvar->dtdv_next;
2103         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2104
2105         /*
2106          * We have now allocated a new chunk.  We copy the tuple keys into the
2107          * tuple array and copy any referenced key data into the data space
2108          * following the tuple array.  As we do this, we relocate dttk_value
2109          * in the final tuple to point to the key data address in the chunk.
2110          */
2111         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2112         dvar->dtdv_data = (void *)(kdata + ksize);
2113         dvar->dtdv_tuple.dtt_nkeys = nkeys;
2114
2115         for (i = 0; i < nkeys; i++) {
2116                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2117                 size_t kesize = key[i].dttk_size;
2118
2119                 if (kesize != 0) {
2120                         dtrace_bcopy(
2121                             (const void *)(uintptr_t)key[i].dttk_value,
2122                             (void *)kdata, kesize);
2123                         dkey->dttk_value = kdata;
2124                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2125                 } else {
2126                         dkey->dttk_value = key[i].dttk_value;
2127                 }
2128
2129                 dkey->dttk_size = kesize;
2130         }
2131
             /*
              * Publish the variable by swinging the chain head from the
              * snapshot taken at top (start) to dvar.
              */
2132         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2133         dvar->dtdv_hashval = hashval;
2134         dvar->dtdv_next = start;
2135
2136         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2137                 return (dvar);
2138
2139         /*
2140          * The cas has failed.  Either another CPU is adding an element to
2141          * this hash chain, or another CPU is deleting an element from this
2142          * hash chain.  The simplest way to deal with both of these cases
2143          * (though not necessarily the most efficient) is to free our
2144          * allocated block and tail-call ourselves.  Note that the free is
2145          * to the dirty list and _not_ to the free list.  This is to prevent
2146          * races with allocators, above.
2147          */
2148         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2149
2150         dtrace_membar_producer();
2151
2152         do {
2153                 free = dcpu->dtdsc_dirty;
2154                 dvar->dtdv_next = free;
2155         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2156
2157         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2158 }
2159
/*
 * Aggregating action for min():  replace the stored value with nval when
 * nval, interpreted as a signed 64-bit quantity, is strictly smaller than the
 * stored value interpreted the same way.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t stored = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        if (candidate >= stored)
                return;

        *oval = nval;
}
2167
/*
 * Aggregating action for max():  replace the stored value with nval when
 * nval, interpreted as a signed 64-bit quantity, is strictly larger than the
 * stored value interpreted the same way.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t stored = (int64_t)*oval;
        int64_t candidate = (int64_t)nval;

        if (candidate <= stored)
                return;

        *oval = nval;
}
2175
2176 static void
2177 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2178 {
2179         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2180         int64_t val = (int64_t)nval;
2181
2182         if (val < 0) {
2183                 for (i = 0; i < zero; i++) {
2184                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2185                                 quanta[i] += incr;
2186                                 return;
2187                         }
2188                 }
2189         } else {
2190                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2191                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2192                                 quanta[i - 1] += incr;
2193                                 return;
2194                         }
2195                 }
2196
2197                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2198                 return;
2199         }
2200
2201         ASSERT(0);
2202 }
2203
/*
 * Aggregating action for lquantize():  add incr to the linear bucket selected
 * by nval.
 *
 * The first element of lquanta is the encoded argument from which the base,
 * step and number of levels are decoded; the buckets follow it.  Bucket 0
 * counts underflows (values below base), buckets 1 through levels count the
 * linear levels, and bucket levels + 1 counts overflows.  nval is truncated
 * to a signed 32-bit value before it is classified.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int32_t val = (int32_t)nval;
        int64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * The distance from base is computed in 64 bits:  with a negative
         * base and a large val, val - base does not fit in an int32_t, and
         * a wrapped (negative) level would pass the range check below and
         * index far outside of the bucket array.
         */
        level = ((int64_t)val - (int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2236
/*
 * Compute the index of the log/linear bucket into which value falls.
 *
 * Values below factor raised to the power low map to bucket 0.  Each order
 * of magnitude from low through high is then examined in turn; within an
 * order, the value's offset from the bottom of that order is divided by the
 * bucket width to give its position, which is added to the running index.
 * Values at or beyond the top of the highest order map to the final index.
 */
static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
    uint16_t high, uint16_t nsteps, int64_t value)
{
        int64_t lower, upper = 1, following;
        int index = 1, magnitude = 0;

        ASSERT(factor <= nsteps);
        ASSERT(nsteps % factor == 0);

        while (magnitude < low) {
                upper *= factor;
                magnitude++;
        }

        /*
         * Anything smaller than factor^low belongs in the zeroth bucket.
         */
        lower = upper;

        if (value < lower)
                return (0);

        upper *= factor;

        while (magnitude <= high) {
                int width = upper > nsteps ? nsteps : upper;

                following = upper * factor;

                if (following < upper) {
                        /*
                         * The next order of magnitude does not fit in 64
                         * bits.  This is not generally expected, but we
                         * guard against it by clamping the value so that it
                         * lands within the current order.
                         */
                        value = upper - 1;
                }

                if (value < upper) {
                        /*
                         * The value lies in this order of magnitude:  its
                         * position is its offset from the bottom of the
                         * order divided by the bucket width, on top of the
                         * index accumulated so far.
                         */
                        return (index + (value - lower) / (upper / width));
                }

                index += width - (width / factor);
                lower = upper;
                upper = following;
                magnitude++;
        }

        /*
         * The value is at least factor^(high + 1):  it goes in the top
         * bucket.
         */
        return (index);
}
2292
/*
 * Apply one value to a log/linear quantization.  The first 64-bit word of
 * llquanta holds the encoded parameters (factor, low, high, nsteps); the
 * buckets follow immediately after it.  The bucket selected for nval is
 * advanced by incr.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        uint64_t encoded = llquanta[0];
        uint64_t *buckets = llquanta + 1;
        int ndx;

        ndx = dtrace_aggregate_llquantize_bucket(
            DTRACE_LLQUANTIZE_FACTOR(encoded),
            DTRACE_LLQUANTIZE_LOW(encoded),
            DTRACE_LLQUANTIZE_HIGH(encoded),
            DTRACE_LLQUANTIZE_NSTEP(encoded), nval);

        buckets[ndx] += incr;
}
2305
/*
 * Accumulate one sample for an average:  data[0] is bumped by one and nval
 * is added to the running total in data[1].  No division happens here, and
 * arg is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t *count = &data[0];
        uint64_t *total = &data[1];

        *count += 1;
        *total += nval;
}
2313
/*
 * Accumulate one sample for a standard deviation.  data[0] counts samples,
 * data[1] holds the running sum of the values, and data[2] and data[3]
 * together hold the 128-bit running sum of the squared values.  arg is
 * ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t magnitude = nval;
        uint64_t tmp[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic.
         *
         * The square only depends on the magnitude of the (signed) value.
         * We negate in unsigned arithmetic:  negating the most negative
         * int64_t as a signed quantity overflows, whereas the unsigned
         * subtraction is well-defined and yields the correct 2^63.
         */
        if ((int64_t)nval < 0)
                magnitude = (uint64_t)0 - nval;

        dtrace_multiply_128(magnitude, magnitude, tmp);
        dtrace_add_128(data + 2, tmp, data + 2);
}
2338
/*
 * Advance the counter at oval by one.  Both nval and arg are ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        (*oval)++;
}
2345
/*
 * Add nval to the running total at oval.  arg is ignored.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        *oval = *oval + nval;
}
2352
2353 /*
2354  * Aggregate given the tuple in the principal data buffer, and the aggregating
2355  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2356  * buffer is specified as the buf parameter.  This routine does not return
2357  * failure; if there is no space in the aggregation buffer, the data will be
2358  * dropped, and a corresponding counter incremented.
2359  */
2360 static void
2361 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2362     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2363 {
2364         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2365         uint32_t i, ndx, size, fsize;
2366         uint32_t align = sizeof (uint64_t) - 1;
2367         dtrace_aggbuffer_t *agb;
2368         dtrace_aggkey_t *key;
2369         uint32_t hashval = 0, limit, isstr;
2370         caddr_t tomax, data, kdata;
2371         dtrace_actkind_t action;
2372         dtrace_action_t *act;
2373         uintptr_t offs;
2374
2375         if (buf == NULL)
2376                 return;
2377
2378         if (!agg->dtag_hasarg) {
2379                 /*
2380                  * Currently, only quantize() and lquantize() take additional
2381                  * arguments, and they have the same semantics:  an increment
2382                  * value that defaults to 1 when not present.  If additional
2383                  * aggregating actions take arguments, the setting of the
2384                  * default argument value will presumably have to become more
2385                  * sophisticated...
2386                  */
2387                 arg = 1;
2388         }
2389
2390         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2391         size = rec->dtrd_offset - agg->dtag_base;
2392         fsize = size + rec->dtrd_size;
2393
2394         ASSERT(dbuf->dtb_tomax != NULL);
2395         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2396
2397         if ((tomax = buf->dtb_tomax) == NULL) {
2398                 dtrace_buffer_drop(buf);
2399                 return;
2400         }
2401
2402         /*
2403          * The metastructure is always at the bottom of the buffer.
2404          */
2405         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2406             sizeof (dtrace_aggbuffer_t));
2407
2408         if (buf->dtb_offset == 0) {
2409                 /*
2410                  * We just kludge up approximately 1/8th of the size to be
2411                  * buckets.  If this guess ends up being routinely
2412                  * off-the-mark, we may need to dynamically readjust this
2413                  * based on past performance.
2414                  */
2415                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2416
2417                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2418                     (uintptr_t)tomax || hashsize == 0) {
2419                         /*
2420                          * We've been given a ludicrously small buffer;
2421                          * increment our drop count and leave.
2422                          */
2423                         dtrace_buffer_drop(buf);
2424                         return;
2425                 }
2426
2427                 /*
2428                  * And now, a pathetic attempt to try to get a an odd (or
2429                  * perchance, a prime) hash size for better hash distribution.
2430                  */
2431                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2432                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2433
2434                 agb->dtagb_hashsize = hashsize;
2435                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2436                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2437                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2438
2439                 for (i = 0; i < agb->dtagb_hashsize; i++)
2440                         agb->dtagb_hash[i] = NULL;
2441         }
2442
2443         ASSERT(agg->dtag_first != NULL);
2444         ASSERT(agg->dtag_first->dta_intuple);
2445
2446         /*
2447          * Calculate the hash value based on the key.  Note that we _don't_
2448          * include the aggid in the hashing (but we will store it as part of
2449          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2450          * algorithm: a simple, quick algorithm that has no known funnels, and
2451          * gets good distribution in practice.  The efficacy of the hashing
2452          * algorithm (and a comparison with other algorithms) may be found by
2453          * running the ::dtrace_aggstat MDB dcmd.
2454          */
2455         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2456                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2457                 limit = i + act->dta_rec.dtrd_size;
2458                 ASSERT(limit <= size);
2459                 isstr = DTRACEACT_ISSTRING(act);
2460
2461                 for (; i < limit; i++) {
2462                         hashval += data[i];
2463                         hashval += (hashval << 10);
2464                         hashval ^= (hashval >> 6);
2465
2466                         if (isstr && data[i] == '\0')
2467                                 break;
2468                 }
2469         }
2470
2471         hashval += (hashval << 3);
2472         hashval ^= (hashval >> 11);
2473         hashval += (hashval << 15);
2474
2475         /*
2476          * Yes, the divide here is expensive -- but it's generally the least
2477          * of the performance issues given the amount of data that we iterate
2478          * over to compute hash values, compare data, etc.
2479          */
2480         ndx = hashval % agb->dtagb_hashsize;
2481
2482         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2483                 ASSERT((caddr_t)key >= tomax);
2484                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2485
2486                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2487                         continue;
2488
2489                 kdata = key->dtak_data;
2490                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2491
2492                 for (act = agg->dtag_first; act->dta_intuple;
2493                     act = act->dta_next) {
2494                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2495                         limit = i + act->dta_rec.dtrd_size;
2496                         ASSERT(limit <= size);
2497                         isstr = DTRACEACT_ISSTRING(act);
2498
2499                         for (; i < limit; i++) {
2500                                 if (kdata[i] != data[i])
2501                                         goto next;
2502
2503                                 if (isstr && data[i] == '\0')
2504                                         break;
2505                         }
2506                 }
2507
2508                 if (action != key->dtak_action) {
2509                         /*
2510                          * We are aggregating on the same value in the same
2511                          * aggregation with two different aggregating actions.
2512                          * (This should have been picked up in the compiler,
2513                          * so we may be dealing with errant or devious DIF.)
2514                          * This is an error condition; we indicate as much,
2515                          * and return.
2516                          */
2517                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2518                         return;
2519                 }
2520
2521                 /*
2522                  * This is a hit:  we need to apply the aggregator to
2523                  * the value at this key.
2524                  */
2525                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2526                 return;
2527 next:
2528                 continue;
2529         }
2530
2531         /*
2532          * We didn't find it.  We need to allocate some zero-filled space,
2533          * link it into the hash table appropriately, and apply the aggregator
2534          * to the (zero-filled) value.
2535          */
2536         offs = buf->dtb_offset;
2537         while (offs & (align - 1))
2538                 offs += sizeof (uint32_t);
2539
2540         /*
2541          * If we don't have enough room to both allocate a new key _and_
2542          * its associated data, increment the drop count and return.
2543          */
2544         if ((uintptr_t)tomax + offs + fsize >
2545             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2546                 dtrace_buffer_drop(buf);
2547                 return;
2548         }
2549
2550         /*CONSTCOND*/
2551         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2552         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2553         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2554
2555         key->dtak_data = kdata = tomax + offs;
2556         buf->dtb_offset = offs + fsize;
2557
2558         /*
2559          * Now copy the data across.
2560          */
2561         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2562
2563         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2564                 kdata[i] = data[i];
2565
2566         /*
2567          * Because strings are not zeroed out by default, we need to iterate
2568          * looking for actions that store strings, and we need to explicitly
2569          * pad these strings out with zeroes.
2570          */
2571         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2572                 int nul;
2573
2574                 if (!DTRACEACT_ISSTRING(act))
2575                         continue;
2576
2577                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2578                 limit = i + act->dta_rec.dtrd_size;
2579                 ASSERT(limit <= size);
2580
2581                 for (nul = 0; i < limit; i++) {
2582                         if (nul) {
2583                                 kdata[i] = '\0';
2584                                 continue;
2585                         }
2586
2587                         if (data[i] != '\0')
2588                                 continue;
2589
2590                         nul = 1;
2591                 }
2592         }
2593
2594         for (i = size; i < fsize; i++)
2595                 kdata[i] = 0;
2596
2597         key->dtak_hashval = hashval;
2598         key->dtak_size = size;
2599         key->dtak_action = action;
2600         key->dtak_next = agb->dtagb_hash[ndx];
2601         agb->dtagb_hash[ndx] = key;
2602
2603         /*
2604          * Finally, apply the aggregator.
2605          */
2606         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2607         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2608 }
2609
2610 /*
2611  * Given consumer state, this routine finds a speculation in the INACTIVE
2612  * state and transitions it into the ACTIVE state.  If there is no speculation
2613  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2614  * incremented -- it is up to the caller to take appropriate action.
2615  */
2616 static int
2617 dtrace_speculation(dtrace_state_t *state)
2618 {
2619         int i = 0;
2620         dtrace_speculation_state_t current;
2621         uint32_t *stat = &state->dts_speculations_unavail, count;
2622
2623         while (i < state->dts_nspeculations) {
2624                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2625
2626                 current = spec->dtsp_state;
2627
2628                 if (current != DTRACESPEC_INACTIVE) {
2629                         if (current == DTRACESPEC_COMMITTINGMANY ||
2630                             current == DTRACESPEC_COMMITTING ||
2631                             current == DTRACESPEC_DISCARDING)
2632                                 stat = &state->dts_speculations_busy;
2633                         i++;
2634                         continue;
2635                 }
2636
2637                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2638                     current, DTRACESPEC_ACTIVE) == current)
2639                         return (i + 1);
2640         }
2641
2642         /*
2643          * We couldn't find a speculation.  If we found as much as a single
2644          * busy speculation buffer, we'll attribute this failure as "busy"
2645          * instead of "unavail".
2646          */
2647         do {
2648                 count = *stat;
2649         } while (dtrace_cas32(stat, count, count + 1) != count);
2650
2651         return (0);
2652 }
2653
/*
 * This routine commits an active speculation.  If the specified speculation
 * is not in a valid state to perform a commit(), this routine will silently do
 * nothing.  The state of the specified speculation is transitioned according
 * to the state transition diagram outlined in <sys/dtrace_impl.h>
 *
 *   state  the consumer state owning both the speculation and the
 *          principal buffers
 *   cpu    selects which per-CPU speculative buffer is copied, and which
 *          per-CPU principal buffer it is copied into
 *   which  one-based speculation identifier; 0 is a no-op, and an identifier
 *          beyond dts_nspeculations flags CPU_DTRACE_ILLOP on the given CPU
 *
 * The routine proceeds in three phases:  move the speculation's state to
 * COMMITTING or COMMITTINGMANY, copy this CPU's speculative data into the
 * principal buffer (or record a drop if there is no room), and then reset
 * this CPU's speculative buffer.
 */
static void
dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
    dtrace_specid_t which)
{
        dtrace_speculation_t *spec;
        dtrace_buffer_t *src, *dest;
        uintptr_t daddr, saddr, dlimit, slimit;
        dtrace_speculation_state_t current, new = 0;
        intptr_t offs;
        uint64_t timestamp;

        if (which == 0)
                return;

        if (which > state->dts_nspeculations) {
                cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
                return;
        }

        spec = &state->dts_speculations[which - 1];
        src = &spec->dtsp_buffer[cpu];
        dest = &state->dts_buffer[cpu];

        /*
         * Pick the new state based on the current one, and retry whenever
         * the compare-and-swap finds that the state changed underneath us.
         */
        do {
                current = spec->dtsp_state;

                /*
                 * If the speculation is already COMMITTINGMANY, there is no
                 * transition to make:  skip the compare-and-swap and go
                 * straight to copying this CPU's data.
                 */
                if (current == DTRACESPEC_COMMITTINGMANY)
                        break;

                switch (current) {
                case DTRACESPEC_INACTIVE:
                case DTRACESPEC_DISCARDING:
                        return;

                case DTRACESPEC_COMMITTING:
                        /*
                         * This is only possible if we are (a) commit()'ing
                         * without having done a prior speculate() on this CPU
                         * and (b) racing with another commit() on a different
                         * CPU.  There's nothing to do -- we just assert that
                         * our offset is 0.
                         */
                        ASSERT(src->dtb_offset == 0);
                        return;

                case DTRACESPEC_ACTIVE:
                        new = DTRACESPEC_COMMITTING;
                        break;

                case DTRACESPEC_ACTIVEONE:
                        /*
                         * This speculation is active on one CPU.  If our
                         * buffer offset is non-zero, we know that the one CPU
                         * must be us.  Otherwise, we are committing on a
                         * different CPU from the speculate(), and we must
                         * rely on being asynchronously cleaned.
                         */
                        if (src->dtb_offset != 0) {
                                new = DTRACESPEC_COMMITTING;
                                break;
                        }
                        /*FALLTHROUGH*/

                case DTRACESPEC_ACTIVEMANY:
                        new = DTRACESPEC_COMMITTINGMANY;
                        break;

                default:
                        ASSERT(0);
                }
        } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
            current, new) != current);

        /*
         * We have set the state to indicate that we are committing this
         * speculation.  Now reserve the necessary space in the destination
         * buffer.
         */
        if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
            sizeof (uint64_t), state, NULL)) < 0) {
                /*
                 * There is no room:  count a drop against the principal
                 * buffer, but still fall through to the cleanup below so
                 * that the speculative buffer is reset.
                 */
                dtrace_buffer_drop(dest);
                goto out;
        }

        /*
         * We have sufficient space to copy the speculative buffer into the
         * primary buffer.  First, modify the speculative buffer, filling
         * in the timestamp of all entries with the current time.  The data
         * must have the commit() time rather than the time it was traced,
         * so that all entries in the primary buffer are in timestamp order.
         */
        timestamp = dtrace_gethrtime();
        saddr = (uintptr_t)src->dtb_tomax;
        slimit = saddr + src->dtb_offset;
        while (saddr < slimit) {
                size_t size;
                dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;

                /*
                 * An EPID of DTRACE_EPIDNONE is not a record; step over it
                 * one dtrace_epid_t at a time.
                 */
                if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
                        saddr += sizeof (dtrace_epid_t);
                        continue;
                }
                ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);

                /*
                 * The record's length is taken from the ECB that its EPID
                 * denotes.
                 */
                size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;

                ASSERT3U(saddr + size, <=, slimit);
                ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
                ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);

                DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);

                saddr += size;
        }

        /*
         * Copy the buffer across.  (Note that this is a
         * highly suboptimal bcopy(); in the unlikely event that this becomes
         * a serious performance issue, a high-performance DTrace-specific
         * bcopy() should obviously be invented.)
         */
        daddr = (uintptr_t)dest->dtb_tomax + offs;
        dlimit = daddr + src->dtb_offset;
        saddr = (uintptr_t)src->dtb_tomax;

        /*
         * First, the aligned portion.
         */
        while (dlimit - daddr >= sizeof (uint64_t)) {
                *((uint64_t *)daddr) = *((uint64_t *)saddr);

                daddr += sizeof (uint64_t);
                saddr += sizeof (uint64_t);
        }

        /*
         * Now any left-over bit...
         */
        while (dlimit - daddr)
                *((uint8_t *)daddr++) = *((uint8_t *)saddr++);

        /*
         * Finally, commit the reserved space in the destination buffer.
         */
        dest->dtb_offset = offs + src->dtb_offset;

out:
        /*
         * If we're lucky enough to be the only active CPU on this speculation
         * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
         */
        if (current == DTRACESPEC_ACTIVE ||
            (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
                uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
                    DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);

                ASSERT(rval == DTRACESPEC_COMMITTING);
        }

        /*
         * Whether the data was copied or dropped, this CPU's speculative
         * buffer is now empty; its drop count is folded into
         * dtb_xamot_drops before being cleared.
         */
        src->dtb_offset = 0;
        src->dtb_xamot_drops += src->dtb_drops;
        src->dtb_drops = 0;
}
2822
2823 /*
2824  * This routine discards an active speculation.  If the specified speculation
2825  * is not in a valid state to perform a discard(), this routine will silently
2826  * do nothing.  The state of the specified speculation is transitioned
2827  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2828  */
2829 static void
2830 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2831     dtrace_specid_t which)
2832 {
2833         dtrace_speculation_t *spec;
2834         dtrace_speculation_state_t current, new = 0;
2835         dtrace_buffer_t *buf;
2836
2837         if (which == 0)
2838                 return;
2839
2840         if (which > state->dts_nspeculations) {
2841                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2842                 return;
2843         }
2844
2845         spec = &state->dts_speculations[which - 1];
2846         buf = &spec->dtsp_buffer[cpu];
2847
2848         do {
2849                 current = spec->dtsp_state;
2850
2851                 switch (current) {
2852                 case DTRACESPEC_INACTIVE:
2853                 case DTRACESPEC_COMMITTINGMANY:
2854                 case DTRACESPEC_COMMITTING:
2855                 case DTRACESPEC_DISCARDING:
2856                         return;
2857
2858                 case DTRACESPEC_ACTIVE:
2859                 case DTRACESPEC_ACTIVEMANY:
2860                         new = DTRACESPEC_DISCARDING;
2861                         break;
2862
2863                 case DTRACESPEC_ACTIVEONE:
2864                         if (buf->dtb_offset != 0) {
2865                                 new = DTRACESPEC_INACTIVE;
2866                         } else {
2867                                 new = DTRACESPEC_DISCARDING;
2868                         }
2869                         break;
2870
2871                 default:
2872                         ASSERT(0);
2873                 }
2874         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2875             current, new) != current);
2876
2877         buf->dtb_offset = 0;
2878         buf->dtb_drops = 0;
2879 }
2880
2881 /*
2882  * Note:  not called from probe context.  This function is called
2883  * asynchronously from cross call context to clean any speculations that are
2884  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2885  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2886  * speculation.
2887  */
2888 static void
2889 dtrace_speculation_clean_here(dtrace_state_t *state)
2890 {
2891         dtrace_icookie_t cookie;
2892         processorid_t cpu = curcpu;
2893         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2894         dtrace_specid_t i;
2895
2896         cookie = dtrace_interrupt_disable();
2897
2898         if (dest->dtb_tomax == NULL) {
2899                 dtrace_interrupt_enable(cookie);
2900                 return;
2901         }
2902
2903         for (i = 0; i < state->dts_nspeculations; i++) {
2904                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2905                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2906
2907                 if (src->dtb_tomax == NULL)
2908                         continue;
2909
2910                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2911                         src->dtb_offset = 0;
2912                         continue;
2913                 }
2914
2915                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2916                         continue;
2917
2918                 if (src->dtb_offset == 0)
2919                         continue;
2920
2921                 dtrace_speculation_commit(state, cpu, i + 1);
2922         }
2923
2924         dtrace_interrupt_enable(cookie);
2925 }
2926
2927 /*
2928  * Note:  not called from probe context.  This function is called
2929  * asynchronously (and at a regular interval) to clean any speculations that
2930  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2931  * is work to be done, it cross calls all CPUs to perform that work;
2932  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2933  * INACTIVE state until they have been cleaned by all CPUs.
2934  */
2935 static void
2936 dtrace_speculation_clean(dtrace_state_t *state)
2937 {
2938         int work = 0, rv;
2939         dtrace_specid_t i;
2940
2941         for (i = 0; i < state->dts_nspeculations; i++) {
2942                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2943
2944                 ASSERT(!spec->dtsp_cleaning);
2945
2946                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2947                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2948                         continue;
2949
2950                 work++;
2951                 spec->dtsp_cleaning = 1;
2952         }
2953
2954         if (!work)
2955                 return;
2956
2957         dtrace_xcall(DTRACE_CPUALL,
2958             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2959
2960         /*
2961          * We now know that all CPUs have committed or discarded their
2962          * speculation buffers, as appropriate.  We can now set the state
2963          * to inactive.
2964          */
2965         for (i = 0; i < state->dts_nspeculations; i++) {
2966                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2967                 dtrace_speculation_state_t current, new;
2968
2969                 if (!spec->dtsp_cleaning)
2970                         continue;
2971
2972                 current = spec->dtsp_state;
2973                 ASSERT(current == DTRACESPEC_DISCARDING ||
2974                     current == DTRACESPEC_COMMITTINGMANY);
2975
2976                 new = DTRACESPEC_INACTIVE;
2977
2978                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2979                 ASSERT(rv == current);
2980                 spec->dtsp_cleaning = 0;
2981         }
2982 }
2983
2984 /*
2985  * Called as part of a speculate() to get the speculative buffer associated
2986  * with a given speculation.  Returns NULL if the specified speculation is not
2987  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2988  * the active CPU is not the specified CPU -- the speculation will be
2989  * atomically transitioned into the ACTIVEMANY state.
2990  */
2991 static dtrace_buffer_t *
2992 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2993     dtrace_specid_t which)
2994 {
2995         dtrace_speculation_t *spec;
2996         dtrace_speculation_state_t current, new = 0;
2997         dtrace_buffer_t *buf;
2998
2999         if (which == 0)
3000                 return (NULL);
3001
3002         if (which > state->dts_nspeculations) {
3003                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3004                 return (NULL);
3005         }
3006
3007         spec = &state->dts_speculations[which - 1];
3008         buf = &spec->dtsp_buffer[cpuid];
3009
3010         do {
3011                 current = spec->dtsp_state;
3012
3013                 switch (current) {
3014                 case DTRACESPEC_INACTIVE:
3015                 case DTRACESPEC_COMMITTINGMANY:
3016                 case DTRACESPEC_DISCARDING:
3017                         return (NULL);
3018
3019                 case DTRACESPEC_COMMITTING:
3020                         ASSERT(buf->dtb_offset == 0);
3021                         return (NULL);
3022
3023                 case DTRACESPEC_ACTIVEONE:
3024                         /*
3025                          * This speculation is currently active on one CPU.
3026                          * Check the offset in the buffer; if it's non-zero,
3027                          * that CPU must be us (and we leave the state alone).
3028                          * If it's zero, assume that we're starting on a new
3029                          * CPU -- and change the state to indicate that the
3030                          * speculation is active on more than one CPU.
3031                          */
3032                         if (buf->dtb_offset != 0)
3033                                 return (buf);
3034
3035                         new = DTRACESPEC_ACTIVEMANY;
3036                         break;
3037
3038                 case DTRACESPEC_ACTIVEMANY:
3039                         return (buf);
3040
3041                 case DTRACESPEC_ACTIVE:
3042                         new = DTRACESPEC_ACTIVEONE;
3043                         break;
3044
3045                 default:
3046                         ASSERT(0);
3047                 }
3048         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3049             current, new) != current);
3050
3051         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3052         return (buf);
3053 }
3054
3055 /*
3056  * Return a string.  In the event that the user lacks the privilege to access
3057  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3058  * don't fail access checking.
3059  *
3060  * dtrace_dif_variable() uses this routine as a helper for various
3061  * builtin values such as 'execname' and 'probefunc.'
3062  */
3063 uintptr_t
3064 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3065     dtrace_mstate_t *mstate)
3066 {
3067         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3068         uintptr_t ret;
3069         size_t strsz;
3070
3071         /*
3072          * The easy case: this probe is allowed to read all of memory, so
3073          * we can just return this as a vanilla pointer.
3074          */
3075         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3076                 return (addr);
3077
3078         /*
3079          * This is the tougher case: we copy the string in question from
3080          * kernel memory into scratch memory and return it that way: this
3081          * ensures that we won't trip up when access checking tests the
3082          * BYREF return value.
3083          */
3084         strsz = dtrace_strlen((char *)addr, size) + 1;
3085
3086         if (mstate->dtms_scratch_ptr + strsz >
3087             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3088                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3089                 return (0);
3090         }
3091
3092         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3093             strsz);
3094         ret = mstate->dtms_scratch_ptr;
3095         mstate->dtms_scratch_ptr += strsz;
3096         return (ret);
3097 }
3098
3099 /*
3100  * Return a string from a memoy address which is known to have one or
3101  * more concatenated, individually zero terminated, sub-strings.
3102  * In the event that the user lacks the privilege to access
3103  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3104  * don't fail access checking.
3105  *
3106  * dtrace_dif_variable() uses this routine as a helper for various
3107  * builtin values such as 'execargs'.
3108  */
3109 static uintptr_t
3110 dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3111     dtrace_mstate_t *mstate)
3112 {
3113         char *p;
3114         size_t i;
3115         uintptr_t ret;
3116
3117         if (mstate->dtms_scratch_ptr + strsz >
3118             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3119                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3120                 return (0);
3121         }
3122
3123         dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3124             strsz);
3125
3126         /* Replace sub-string termination characters with a space. */
3127         for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3128             p++, i++)
3129                 if (*p == '\0')
3130                         *p = ' ';
3131
3132         ret = mstate->dtms_scratch_ptr;
3133         mstate->dtms_scratch_ptr += strsz;
3134         return (ret);
3135 }
3136
3137 /*
3138  * This function implements the DIF emulator's variable lookups.  The emulator
3139  * passes a reserved variable identifier and optional built-in array index.
3140  */
3141 static uint64_t
3142 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3143     uint64_t ndx)
3144 {
3145         /*
3146          * If we're accessing one of the uncached arguments, we'll turn this
3147          * into a reference in the args array.
3148          */
3149         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3150                 ndx = v - DIF_VAR_ARG0;
3151                 v = DIF_VAR_ARGS;
3152         }
3153
3154         switch (v) {
3155         case DIF_VAR_ARGS:
3156                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3157                 if (ndx >= sizeof (mstate->dtms_arg) /
3158                     sizeof (mstate->dtms_arg[0])) {
3159                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3160                         dtrace_provider_t *pv;
3161                         uint64_t val;
3162
3163                         pv = mstate->dtms_probe->dtpr_provider;
3164                         if (pv->dtpv_pops.dtps_getargval != NULL)
3165                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3166                                     mstate->dtms_probe->dtpr_id,
3167                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
3168                         else
3169                                 val = dtrace_getarg(ndx, aframes);
3170
3171                         /*
3172                          * This is regrettably required to keep the compiler
3173                          * from tail-optimizing the call to dtrace_getarg().
3174                          * The condition always evaluates to true, but the
3175                          * compiler has no way of figuring that out a priori.
3176                          * (None of this would be necessary if the compiler
3177                          * could be relied upon to _always_ tail-optimize
3178                          * the call to dtrace_getarg() -- but it can't.)
3179                          */
3180                         if (mstate->dtms_probe != NULL)
3181                                 return (val);
3182
3183                         ASSERT(0);
3184                 }
3185
3186                 return (mstate->dtms_arg[ndx]);
3187
3188 #if defined(sun)
3189         case DIF_VAR_UREGS: {
3190                 klwp_t *lwp;
3191
3192                 if (!dtrace_priv_proc(state))
3193                         return (0);
3194
3195                 if ((lwp = curthread->t_lwp) == NULL) {
3196                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3197                         cpu_core[curcpu].cpuc_dtrace_illval = NULL;
3198                         return (0);
3199                 }
3200
3201                 return (dtrace_getreg(lwp->lwp_regs, ndx));
3202                 return (0);
3203         }
3204 #else
3205         case DIF_VAR_UREGS: {
3206                 struct trapframe *tframe;
3207
3208                 if (!dtrace_priv_proc(state))
3209                         return (0);
3210
3211                 if ((tframe = curthread->td_frame) == NULL) {
3212                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3213                         cpu_core[curcpu].cpuc_dtrace_illval = 0;
3214                         return (0);
3215                 }
3216
3217                 return (dtrace_getreg(tframe, ndx));
3218         }
3219 #endif
3220
3221         case DIF_VAR_CURTHREAD:
3222                 if (!dtrace_priv_proc(state))
3223                         return (0);
3224                 return ((uint64_t)(uintptr_t)curthread);
3225
3226         case DIF_VAR_TIMESTAMP:
3227                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3228                         mstate->dtms_timestamp = dtrace_gethrtime();
3229                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3230                 }
3231                 return (mstate->dtms_timestamp);
3232
3233         case DIF_VAR_VTIMESTAMP:
3234                 ASSERT(dtrace_vtime_references != 0);
3235                 return (curthread->t_dtrace_vtime);
3236
3237         case DIF_VAR_WALLTIMESTAMP:
3238                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3239                         mstate->dtms_walltimestamp = dtrace_gethrestime();
3240                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3241                 }
3242                 return (mstate->dtms_walltimestamp);
3243
3244 #if defined(sun)
3245         case DIF_VAR_IPL:
3246                 if (!dtrace_priv_kernel(state))
3247                         return (0);
3248                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3249                         mstate->dtms_ipl = dtrace_getipl();
3250                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
3251                 }
3252                 return (mstate->dtms_ipl);
3253 #endif
3254
3255         case DIF_VAR_EPID:
3256                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3257                 return (mstate->dtms_epid);
3258
3259         case DIF_VAR_ID:
3260                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3261                 return (mstate->dtms_probe->dtpr_id);
3262
3263         case DIF_VAR_STACKDEPTH:
3264                 if (!dtrace_priv_kernel(state))
3265                         return (0);
3266                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3267                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3268
3269                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3270                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3271                 }
3272                 return (mstate->dtms_stackdepth);
3273
3274         case DIF_VAR_USTACKDEPTH:
3275                 if (!dtrace_priv_proc(state))
3276                         return (0);
3277                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3278                         /*
3279                          * See comment in DIF_VAR_PID.
3280                          */
3281                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3282                             CPU_ON_INTR(CPU)) {
3283                                 mstate->dtms_ustackdepth = 0;
3284                         } else {
3285                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3286                                 mstate->dtms_ustackdepth =
3287                                     dtrace_getustackdepth();
3288                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3289                         }
3290                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3291                 }
3292                 return (mstate->dtms_ustackdepth);
3293
3294         case DIF_VAR_CALLER:
3295                 if (!dtrace_priv_kernel(state))
3296                         return (0);
3297                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3298                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3299
3300                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3301                                 /*
3302                                  * If this is an unanchored probe, we are
3303                                  * required to go through the slow path:
3304                                  * dtrace_caller() only guarantees correct
3305                                  * results for anchored probes.
3306                                  */
3307                                 pc_t caller[2] = {0, 0};
3308
3309                                 dtrace_getpcstack(caller, 2, aframes,
3310                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3311                                 mstate->dtms_caller = caller[1];
3312                         } else if ((mstate->dtms_caller =
3313                             dtrace_caller(aframes)) == -1) {
3314                                 /*
3315                                  * We have failed to do this the quick way;
3316                                  * we must resort to the slower approach of
3317                                  * calling dtrace_getpcstack().
3318                                  */
3319                                 pc_t caller = 0;
3320
3321                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3322                                 mstate->dtms_caller = caller;
3323                         }
3324
3325                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3326                 }
3327                 return (mstate->dtms_caller);
3328
3329         case DIF_VAR_UCALLER:
3330                 if (!dtrace_priv_proc(state))
3331                         return (0);
3332
3333                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3334                         uint64_t ustack[3];
3335
3336                         /*
3337                          * dtrace_getupcstack() fills in the first uint64_t
3338                          * with the current PID.  The second uint64_t will
3339                          * be the program counter at user-level.  The third
3340                          * uint64_t will contain the caller, which is what
3341                          * we're after.
3342                          */
3343                         ustack[2] = 0;
3344                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3345                         dtrace_getupcstack(ustack, 3);
3346                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3347                         mstate->dtms_ucaller = ustack[2];
3348                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3349                 }
3350
3351                 return (mstate->dtms_ucaller);
3352
3353         case DIF_VAR_PROBEPROV:
3354                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3355                 return (dtrace_dif_varstr(
3356                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3357                     state, mstate));
3358
3359         case DIF_VAR_PROBEMOD:
3360                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3361                 return (dtrace_dif_varstr(
3362                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3363                     state, mstate));
3364
3365         case DIF_VAR_PROBEFUNC:
3366                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3367                 return (dtrace_dif_varstr(
3368                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3369                     state, mstate));
3370
3371         case DIF_VAR_PROBENAME:
3372                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3373                 return (dtrace_dif_varstr(
3374                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3375                     state, mstate));
3376
3377         case DIF_VAR_PID:
3378                 if (!dtrace_priv_proc(state))
3379                         return (0);
3380
3381 #if defined(sun)
3382                 /*
3383                  * Note that we are assuming that an unanchored probe is
3384                  * always due to a high-level interrupt.  (And we're assuming
3385                  * that there is only a single high level interrupt.)
3386                  */
3387                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3388                         return (pid0.pid_id);
3389
3390                 /*
3391                  * It is always safe to dereference one's own t_procp pointer:
3392                  * it always points to a valid, allocated proc structure.
3393                  * Further, it is always safe to dereference the p_pidp member
3394                  * of one's own proc structure.  (These are truisms becuase
3395                  * threads and processes don't clean up their own state --
3396                  * they leave that task to whomever reaps them.)
3397                  */
3398                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3399 #else
3400                 return ((uint64_t)curproc->p_pid);
3401 #endif
3402
3403         case DIF_VAR_PPID:
3404                 if (!dtrace_priv_proc(state))
3405                         return (0);
3406
3407 #if defined(sun)
3408                 /*
3409                  * See comment in DIF_VAR_PID.
3410                  */
3411                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3412                         return (pid0.pid_id);
3413
3414                 /*
3415                  * It is always safe to dereference one's own t_procp pointer:
3416                  * it always points to a valid, allocated proc structure.
3417                  * (This is true because threads don't clean up their own
3418                  * state -- they leave that task to whomever reaps them.)
3419                  */
3420                 return ((uint64_t)curthread->t_procp->p_ppid);
3421 #else
3422                 if (curproc->p_pid == proc0.p_pid)
3423                         return (curproc->p_pid);
3424                 else
3425                         return (curproc->p_pptr->p_pid);
3426 #endif
3427
3428         case DIF_VAR_TID:
3429 #if defined(sun)
3430                 /*
3431                  * See comment in DIF_VAR_PID.
3432                  */
3433                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3434                         return (0);
3435 #endif
3436
3437                 return ((uint64_t)curthread->t_tid);
3438
3439         case DIF_VAR_EXECARGS: {
3440                 struct pargs *p_args = curthread->td_proc->p_args;
3441
3442                 if (p_args == NULL)
3443                         return(0);
3444
3445                 return (dtrace_dif_varstrz(
3446                     (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3447         }
3448
3449         case DIF_VAR_EXECNAME:
3450 #if defined(sun)
3451                 if (!dtrace_priv_proc(state))
3452                         return (0);
3453
3454                 /*
3455                  * See comment in DIF_VAR_PID.
3456                  */
3457                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3458                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3459
3460                 /*
3461                  * It is always safe to dereference one's own t_procp pointer:
3462                  * it always points to a valid, allocated proc structure.
3463                  * (This is true because threads don't clean up their own
3464                  * state -- they leave that task to whomever reaps them.)
3465                  */
3466                 return (dtrace_dif_varstr(
3467                     (uintptr_t)curthread->t_procp->p_user.u_comm,
3468                     state, mstate));
3469 #else
3470                 return (dtrace_dif_varstr(
3471                     (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3472 #endif
3473
3474         case DIF_VAR_ZONENAME:
3475 #if defined(sun)
3476                 if (!dtrace_priv_proc(state))
3477                         return (0);
3478
3479                 /*
3480                  * See comment in DIF_VAR_PID.
3481                  */
3482                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3483                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3484
3485                 /*
3486                  * It is always safe to dereference one's own t_procp pointer:
3487                  * it always points to a valid, allocated proc structure.
3488                  * (This is true because threads don't clean up their own
3489                  * state -- they leave that task to whomever reaps them.)
3490                  */
3491                 return (dtrace_dif_varstr(
3492                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
3493                     state, mstate));
3494 #else
3495                 return (0);
3496 #endif
3497
3498         case DIF_VAR_UID:
3499                 if (!dtrace_priv_proc(state))
3500                         return (0);
3501
3502 #if defined(sun)
3503                 /*
3504                  * See comment in DIF_VAR_PID.
3505                  */
3506                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3507                         return ((uint64_t)p0.p_cred->cr_uid);
3508 #endif
3509
3510                 /*
3511                  * It is always safe to dereference one's own t_procp pointer:
3512                  * it always points to a valid, allocated proc structure.
3513                  * (This is true because threads don't clean up their own
3514                  * state -- they leave that task to whomever reaps them.)
3515                  *
3516                  * Additionally, it is safe to dereference one's own process
3517                  * credential, since this is never NULL after process birth.
3518                  */
3519                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3520
3521         case DIF_VAR_GID:
3522                 if (!dtrace_priv_proc(state))
3523                         return (0);
3524
3525 #if defined(sun)
3526                 /*
3527                  * See comment in DIF_VAR_PID.
3528                  */
3529                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3530                         return ((uint64_t)p0.p_cred->cr_gid);
3531 #endif
3532
3533                 /*
3534                  * It is always safe to dereference one's own t_procp pointer:
3535                  * it always points to a valid, allocated proc structure.
3536                  * (This is true because threads don't clean up their own
3537                  * state -- they leave that task to whomever reaps them.)
3538                  *
3539                  * Additionally, it is safe to dereference one's own process
3540                  * credential, since this is never NULL after process birth.
3541                  */
3542                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3543
3544         case DIF_VAR_ERRNO: {
3545 #if defined(sun)
3546                 klwp_t *lwp;
3547                 if (!dtrace_priv_proc(state))
3548                         return (0);
3549
3550                 /*
3551                  * See comment in DIF_VAR_PID.
3552                  */
3553                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3554                         return (0);
3555
3556                 /*
3557                  * It is always safe to dereference one's own t_lwp pointer in
3558                  * the event that this pointer is non-NULL.  (This is true
3559                  * because threads and lwps don't clean up their own state --
3560                  * they leave that task to whomever reaps them.)
3561                  */
3562                 if ((lwp = curthread->t_lwp) == NULL)
3563                         return (0);
3564
3565                 return ((uint64_t)lwp->lwp_errno);
3566 #else
3567                 return (curthread->td_errno);
3568 #endif
3569         }
3570 #if !defined(sun)
3571         case DIF_VAR_CPU: {
3572                 return curcpu;
3573         }
3574 #endif
3575         default:
3576                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3577                 return (0);
3578         }
3579 }
3580
3581
/*
 * States of the dtrace_json() parser.  The numbering starts at 1 and each
 * enumerator is one greater than the one before it.
 */
typedef enum dtrace_json_state {
        /* skipping whitespace ahead of the top-level Object or Array */
        DTRACE_JSON_REST = 1,
        /* locating the next key String in an Object */
        DTRACE_JSON_OBJECT = 2,
        /* inside a String literal (key or value) */
        DTRACE_JSON_STRING = 3,
        /* just after a backslash inside a String */
        DTRACE_JSON_STRING_ESCAPE = 4,
        /* reading the four hex digits of a unicode escape */
        DTRACE_JSON_STRING_ESCAPE_UNICODE = 5,
        /* looking for the colon between a key and its value */
        DTRACE_JSON_COLON = 6,
        /* looking for the comma after a value */
        DTRACE_JSON_COMMA = 7,
        /* working out the type of the next value */
        DTRACE_JSON_VALUE = 8,
        /* inside a "true", "false" or "null" literal */
        DTRACE_JSON_IDENTIFIER = 9,
        /* inside a Number literal */
        DTRACE_JSON_NUMBER = 10,
        DTRACE_JSON_NUMBER_FRAC = 11,
        DTRACE_JSON_NUMBER_EXP = 12,
        /* collecting (or skipping) an entire nested Object or Array */
        DTRACE_JSON_COLLECT_OBJECT = 13
} dtrace_json_state_t;
3597
3598 /*
3599  * This function possesses just enough knowledge about JSON to extract a single
3600  * value from a JSON string and store it in the scratch buffer.  It is able
3601  * to extract nested object values, and members of arrays by index.
3602  *
3603  * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3604  * be looked up as we descend into the object tree.  e.g.
3605  *
3606  *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3607  *       with nelems = 5.
3608  *
3609  * The run time of this function must be bounded above by strsize to limit the
3610  * amount of work done in probe context.  As such, it is implemented as a
3611  * simple state machine, reading one character at a time using safe loads
3612  * until we find the requested element, hit a parsing error or run off the
3613  * end of the object or string.
3614  *
3615  * As there is no way for a subroutine to return an error without interrupting
3616  * clause execution, we simply return NULL in the event of a missing key or any
3617  * other error condition.  Each NULL return in this function is commented with
3618  * the error condition it represents -- parsing or otherwise.
3619  *
3620  * The set of states for the state machine closely matches the JSON
3621  * specification (http://json.org/).  Briefly:
3622  *
3623  *   DTRACE_JSON_REST:
3624  *     Skip whitespace until we find either a top-level Object, moving
3625  *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3626  *
3627  *   DTRACE_JSON_OBJECT:
3628  *     Locate the next key String in an Object.  Sets a flag to denote
3629  *     the next String as a key string and moves to DTRACE_JSON_STRING.
3630  *
3631  *   DTRACE_JSON_COLON:
3632  *     Skip whitespace until we find the colon that separates key Strings
3633  *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3634  *
3635  *   DTRACE_JSON_VALUE:
3636  *     Detects the type of the next value (String, Number, Identifier, Object
3637  *     or Array) and routes to the states that process that type.  Here we also
3638  *     deal with the element selector list if we are requested to traverse down
3639  *     into the object tree.
3640  *
3641  *   DTRACE_JSON_COMMA:
3642  *     Skip whitespace until we find the comma that separates key-value pairs
3643  *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3644  *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3645  *     states return to this state at the end of their value, unless otherwise
3646  *     noted.
3647  *
3648  *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3649  *     Processes a Number literal from the JSON, including any exponent
3650  *     component that may be present.  Numbers are returned as strings, which
3651  *     may be passed to strtoll() if an integer is required.
3652  *
3653  *   DTRACE_JSON_IDENTIFIER:
3654  *     Processes a "true", "false" or "null" literal in the JSON.
3655  *
3656  *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3657  *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3658  *     Processes a String literal from the JSON, whether the String denotes
3659  *     a key, a value or part of a larger Object.  Handles all escape sequences
3660  *     present in the specification, including four-digit unicode characters,
3661  *     but merely includes the escape sequence without converting it to the
3662  *     actual escaped character.  If the String is flagged as a key, we
3663  *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3664  *
3665  *   DTRACE_JSON_COLLECT_OBJECT:
3666  *     This state collects an entire Object (or Array), correctly handling
3667  *     embedded strings.  If the full element selector list matches this nested
3668  *     object, we return the Object in full as a string.  If not, we use this
3669  *     state to skip to the next value at this level and continue processing.
3670  *
3671  * NOTE: This function uses various macros from strtolctype.h to manipulate
3672  * digit values, etc -- these have all been checked to ensure they make
3673  * no additional function calls.
3674  */
3675 static char *
3676 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3677     char *dest)
3678 {
3679         dtrace_json_state_t state = DTRACE_JSON_REST;
3680         int64_t array_elem = INT64_MIN;
3681         int64_t array_pos = 0;
3682         uint8_t escape_unicount = 0;
3683         boolean_t string_is_key = B_FALSE;
3684         boolean_t collect_object = B_FALSE;
3685         boolean_t found_key = B_FALSE;
3686         boolean_t in_array = B_FALSE;
3687         uint32_t braces = 0, brackets = 0;
3688         char *elem = elemlist;
3689         char *dd = dest;
3690         uintptr_t cur;
3691
3692         for (cur = json; cur < json + size; cur++) {
3693                 char cc = dtrace_load8(cur);
3694                 if (cc == '\0')
3695                         return (NULL);
3696
3697                 switch (state) {
3698                 case DTRACE_JSON_REST:
3699                         if (isspace(cc))
3700                                 break;
3701
3702                         if (cc == '{') {
3703                                 state = DTRACE_JSON_OBJECT;
3704                                 break;
3705                         }
3706
3707                         if (cc == '[') {
3708                                 in_array = B_TRUE;
3709                                 array_pos = 0;
3710                                 array_elem = dtrace_strtoll(elem, 10, size);
3711                                 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3712                                 state = DTRACE_JSON_VALUE;
3713                                 break;
3714                         }
3715
3716                         /*
3717                          * ERROR: expected to find a top-level object or array.
3718                          */
3719                         return (NULL);
3720                 case DTRACE_JSON_OBJECT:
3721                         if (isspace(cc))
3722                                 break;
3723
3724                         if (cc == '"') {
3725                                 state = DTRACE_JSON_STRING;
3726                                 string_is_key = B_TRUE;
3727                                 break;
3728                         }
3729
3730                         /*
3731                          * ERROR: either the object did not start with a key
3732                          * string, or we've run off the end of the object
3733                          * without finding the requested key.
3734                          */
3735                         return (NULL);
3736                 case DTRACE_JSON_STRING:
3737                         if (cc == '\\') {
3738                                 *dd++ = '\\';
3739                                 state = DTRACE_JSON_STRING_ESCAPE;
3740                                 break;
3741                         }
3742
3743                         if (cc == '"') {
3744                                 if (collect_object) {
3745                                         /*
3746                                          * We don't reset the dest here, as
3747                                          * the string is part of a larger
3748                                          * object being collected.
3749                                          */
3750                                         *dd++ = cc;
3751                                         collect_object = B_FALSE;
3752                                         state = DTRACE_JSON_COLLECT_OBJECT;
3753                                         break;
3754                                 }
3755                                 *dd = '\0';
3756                                 dd = dest; /* reset string buffer */
3757                                 if (string_is_key) {
3758                                         if (dtrace_strncmp(dest, elem,
3759                                             size) == 0)
3760                                                 found_key = B_TRUE;
3761                                 } else if (found_key) {
3762                                         if (nelems > 1) {
3763                                                 /*
3764                                                  * We expected an object, not
3765                                                  * this string.
3766                                                  */
3767                                                 return (NULL);
3768                                         }
3769                                         return (dest);
3770                                 }
3771                                 state = string_is_key ? DTRACE_JSON_COLON :
3772                                     DTRACE_JSON_COMMA;
3773                                 string_is_key = B_FALSE;
3774                                 break;
3775                         }
3776
3777                         *dd++ = cc;
3778                         break;
3779                 case DTRACE_JSON_STRING_ESCAPE:
3780                         *dd++ = cc;
3781                         if (cc == 'u') {
3782                                 escape_unicount = 0;
3783                                 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3784                         } else {
3785                                 state = DTRACE_JSON_STRING;
3786                         }
3787                         break;
3788                 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3789                         if (!isxdigit(cc)) {
3790                                 /*
3791                                  * ERROR: invalid unicode escape, expected
3792                                  * four valid hexadecimal digits.
3793                                  */
3794                                 return (NULL);
3795                         }
3796
3797                         *dd++ = cc;
3798                         if (++escape_unicount == 4)
3799                                 state = DTRACE_JSON_STRING;
3800                         break;
3801                 case DTRACE_JSON_COLON:
3802                         if (isspace(cc))
3803                                 break;
3804
3805                         if (cc == ':') {
3806                                 state = DTRACE_JSON_VALUE;
3807                                 break;
3808                         }
3809
3810                         /*
3811                          * ERROR: expected a colon.
3812                          */
3813                         return (NULL);
3814                 case DTRACE_JSON_COMMA:
3815                         if (isspace(cc))
3816                                 break;
3817
3818                         if (cc == ',') {
3819                                 if (in_array) {
3820                                         state = DTRACE_JSON_VALUE;
3821                                         if (++array_pos == array_elem)
3822                                                 found_key = B_TRUE;
3823                                 } else {
3824                                         state = DTRACE_JSON_OBJECT;
3825                                 }
3826                                 break;
3827                         }
3828
3829                         /*
3830                          * ERROR: either we hit an unexpected character, or
3831                          * we reached the end of the object or array without
3832                          * finding the requested key.
3833                          */
3834                         return (NULL);
3835                 case DTRACE_JSON_IDENTIFIER:
3836                         if (islower(cc)) {
3837                                 *dd++ = cc;
3838                                 break;
3839                         }
3840
3841                         *dd = '\0';
3842                         dd = dest; /* reset string buffer */
3843
3844                         if (dtrace_strncmp(dest, "true", 5) == 0 ||
3845                             dtrace_strncmp(dest, "false", 6) == 0 ||
3846                             dtrace_strncmp(dest, "null", 5) == 0) {
3847                                 if (found_key) {
3848                                         if (nelems > 1) {
3849                                                 /*
3850                                                  * ERROR: We expected an object,
3851                                                  * not this identifier.
3852                                                  */
3853                                                 return (NULL);
3854                                         }
3855                                         return (dest);
3856                                 } else {
3857                                         cur--;
3858                                         state = DTRACE_JSON_COMMA;
3859                                         break;
3860                                 }
3861                         }
3862
3863                         /*
3864                          * ERROR: we did not recognise the identifier as one
3865                          * of those in the JSON specification.
3866                          */
3867                         return (NULL);
3868                 case DTRACE_JSON_NUMBER:
3869                         if (cc == '.') {
3870                                 *dd++ = cc;
3871                                 state = DTRACE_JSON_NUMBER_FRAC;
3872                                 break;
3873                         }
3874
3875                         if (cc == 'x' || cc == 'X') {
3876                                 /*
3877                                  * ERROR: specification explicitly excludes
3878                                  * hexadecimal or octal numbers.
3879                                  */
3880                                 return (NULL);
3881                         }
3882
3883                         /* FALLTHRU */
3884                 case DTRACE_JSON_NUMBER_FRAC:
3885                         if (cc == 'e' || cc == 'E') {
3886                                 *dd++ = cc;
3887                                 state = DTRACE_JSON_NUMBER_EXP;
3888                                 break;
3889                         }
3890
3891                         if (cc == '+' || cc == '-') {
3892                                 /*
3893                                  * ERROR: expect sign as part of exponent only.
3894                                  */
3895                                 return (NULL);
3896                         }
3897                         /* FALLTHRU */
3898                 case DTRACE_JSON_NUMBER_EXP:
3899                         if (isdigit(cc) || cc == '+' || cc == '-') {
3900                                 *dd++ = cc;
3901                                 break;
3902                         }
3903
3904                         *dd = '\0';
3905                         dd = dest; /* reset string buffer */
3906                         if (found_key) {
3907                                 if (nelems > 1) {
3908                                         /*
3909                                          * ERROR: We expected an object, not
3910                                          * this number.
3911                                          */
3912                                         return (NULL);
3913                                 }
3914                                 return (dest);
3915                         }
3916
3917                         cur--;
3918                         state = DTRACE_JSON_COMMA;
3919                         break;
3920                 case DTRACE_JSON_VALUE:
3921                         if (isspace(cc))
3922                                 break;
3923
3924                         if (cc == '{' || cc == '[') {
3925                                 if (nelems > 1 && found_key) {
3926                                         in_array = cc == '[' ? B_TRUE : B_FALSE;
3927                                         /*
3928                                          * If our element selector directs us
3929                                          * to descend into this nested object,
3930                                          * then move to the next selector
3931                                          * element in the list and restart the
3932                                          * state machine.
3933                                          */
3934                                         while (*elem != '\0')
3935                                                 elem++;
3936                                         elem++; /* skip the inter-element NUL */
3937                                         nelems--;
3938                                         dd = dest;
3939                                         if (in_array) {
3940                                                 state = DTRACE_JSON_VALUE;
3941                                                 array_pos = 0;
3942                                                 array_elem = dtrace_strtoll(
3943                                                     elem, 10, size);
3944                                                 found_key = array_elem == 0 ?
3945                                                     B_TRUE : B_FALSE;
3946                                         } else {
3947                                                 found_key = B_FALSE;
3948                                                 state = DTRACE_JSON_OBJECT;
3949                                         }
3950                                         break;
3951                                 }
3952
3953                                 /*
3954                                  * Otherwise, we wish to either skip this
3955                                  * nested object or return it in full.
3956                                  */
3957                                 if (cc == '[')
3958                                         brackets = 1;
3959                                 else
3960                                         braces = 1;
3961                                 *dd++ = cc;
3962                                 state = DTRACE_JSON_COLLECT_OBJECT;
3963                                 break;
3964                         }
3965
3966                         if (cc == '"') {
3967                                 state = DTRACE_JSON_STRING;
3968                                 break;
3969                         }
3970
3971                         if (islower(cc)) {
3972                                 /*
3973                                  * Here we deal with true, false and null.
3974                                  */
3975                                 *dd++ = cc;
3976                                 state = DTRACE_JSON_IDENTIFIER;
3977                                 break;
3978                         }
3979
3980                         if (cc == '-' || isdigit(cc)) {
3981                                 *dd++ = cc;
3982                                 state = DTRACE_JSON_NUMBER;
3983                                 break;
3984                         }
3985
3986                         /*
3987                          * ERROR: unexpected character at start of value.
3988                          */
3989                         return (NULL);
3990                 case DTRACE_JSON_COLLECT_OBJECT:
3991                         if (cc == '\0')
3992                                 /*
3993                                  * ERROR: unexpected end of input.
3994                                  */
3995                                 return (NULL);
3996
3997                         *dd++ = cc;
3998                         if (cc == '"') {
3999                                 collect_object = B_TRUE;
4000                                 state = DTRACE_JSON_STRING;
4001                                 break;
4002                         }
4003
4004                         if (cc == ']') {
4005                                 if (brackets-- == 0) {
4006                                         /*
4007                                          * ERROR: unbalanced brackets.
4008                                          */
4009                                         return (NULL);
4010                                 }
4011                         } else if (cc == '}') {
4012                                 if (braces-- == 0) {
4013                                         /*
4014                                          * ERROR: unbalanced braces.
4015                                          */
4016                                         return (NULL);
4017                                 }
4018                         } else if (cc == '{') {
4019                                 braces++;
4020                         } else if (cc == '[') {
4021                                 brackets++;
4022                         }
4023
4024                         if (brackets == 0 && braces == 0) {
4025                                 if (found_key) {
4026                                         *dd = '\0';
4027                                         return (dest);
4028                                 }
4029                                 dd = dest; /* reset string buffer */
4030                                 state = DTRACE_JSON_COMMA;
4031                         }
4032                         break;
4033                 }
4034         }
4035         return (NULL);
4036 }
4037
4038 /*
4039  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4040  * Notice that we don't bother validating the proper number of arguments or
4041  * their types in the tuple stack.  This isn't needed: our load safety makes
4042  * all argument interpretation safe -- the worst that can happen is that a
4043  * bogus program can obtain bogus results.
4044  */
4045 static void
4046 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4047     dtrace_key_t *tupregs, int nargs,
4048     dtrace_mstate_t *mstate, dtrace_state_t *state)
4049 {
4050         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4051         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4052         dtrace_vstate_t *vstate = &state->dts_vstate;
4053
4054 #if defined(sun)
4055         union {
4056                 mutex_impl_t mi;
4057                 uint64_t mx;
4058         } m;
4059
4060         union {
4061                 krwlock_t ri;
4062                 uintptr_t rw;
4063         } r;
4064 #else
4065         struct thread *lowner;
4066         union {
4067                 struct lock_object *li;
4068                 uintptr_t lx;
4069         } l;
4070 #endif
4071
4072         switch (subr) {
4073         case DIF_SUBR_RAND:
4074                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4075                 break;
4076
4077 #if defined(sun)
4078         case DIF_SUBR_MUTEX_OWNED:
4079                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4080                     mstate, vstate)) {
4081                         regs[rd] = 0;
4082                         break;
4083                 }
4084
4085                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4086                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4087                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4088                 else
4089                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4090                 break;
4091
4092         case DIF_SUBR_MUTEX_OWNER:
4093                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4094                     mstate, vstate)) {
4095                         regs[rd] = 0;
4096                         break;
4097                 }
4098
4099                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4100                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4101                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4102                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4103                 else
4104                         regs[rd] = 0;
4105                 break;
4106
4107         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4108                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4109                     mstate, vstate)) {
4110                         regs[rd] = 0;
4111                         break;
4112                 }
4113
4114                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4115                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4116                 break;
4117
4118         case DIF_SUBR_MUTEX_TYPE_SPIN:
4119                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4120                     mstate, vstate)) {
4121                         regs[rd] = 0;
4122                         break;
4123                 }
4124
4125                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4126                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4127                 break;
4128
4129         case DIF_SUBR_RW_READ_HELD: {
4130                 uintptr_t tmp;
4131
4132                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4133                     mstate, vstate)) {
4134                         regs[rd] = 0;
4135                         break;
4136                 }
4137
4138                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4139                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4140                 break;
4141         }
4142
4143         case DIF_SUBR_RW_WRITE_HELD:
4144                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4145                     mstate, vstate)) {
4146                         regs[rd] = 0;
4147                         break;
4148                 }
4149
4150                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4151                 regs[rd] = _RW_WRITE_HELD(&r.ri);
4152                 break;
4153
4154         case DIF_SUBR_RW_ISWRITER:
4155                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4156                     mstate, vstate)) {
4157                         regs[rd] = 0;
4158                         break;
4159                 }
4160
4161                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4162                 regs[rd] = _RW_ISWRITER(&r.ri);
4163                 break;
4164
4165 #else
4166         case DIF_SUBR_MUTEX_OWNED:
4167                 if (!dtrace_canload(tupregs[0].dttk_value,
4168                         sizeof (struct lock_object), mstate, vstate)) {
4169                         regs[rd] = 0;
4170                         break;
4171                 }
4172                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4173                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4174                 break;
4175
4176         case DIF_SUBR_MUTEX_OWNER:
4177                 if (!dtrace_canload(tupregs[0].dttk_value,
4178                         sizeof (struct lock_object), mstate, vstate)) {
4179                         regs[rd] = 0;
4180                         break;
4181                 }
4182                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4183                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4184                 regs[rd] = (uintptr_t)lowner;
4185                 break;
4186
4187         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4188                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4189                     mstate, vstate)) {
4190                         regs[rd] = 0;
4191                         break;
4192                 }
4193                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4194                 /* XXX - should be only LC_SLEEPABLE? */
4195                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4196                     (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4197                 break;
4198
4199         case DIF_SUBR_MUTEX_TYPE_SPIN:
4200                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4201                     mstate, vstate)) {
4202                         regs[rd] = 0;
4203                         break;
4204                 }
4205                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4206                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4207                 break;
4208
4209         case DIF_SUBR_RW_READ_HELD:
4210         case DIF_SUBR_SX_SHARED_HELD:
4211                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4212                     mstate, vstate)) {
4213                         regs[rd] = 0;
4214                         break;
4215                 }
4216                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4217                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4218                     lowner == NULL;
4219                 break;
4220
4221         case DIF_SUBR_RW_WRITE_HELD:
4222         case DIF_SUBR_SX_EXCLUSIVE_HELD:
4223                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4224                     mstate, vstate)) {
4225                         regs[rd] = 0;
4226                         break;
4227                 }
4228                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4229                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4230                 regs[rd] = (lowner == curthread);
4231                 break;
4232
4233         case DIF_SUBR_RW_ISWRITER:
4234         case DIF_SUBR_SX_ISEXCLUSIVE:
4235                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4236                     mstate, vstate)) {
4237                         regs[rd] = 0;
4238                         break;
4239                 }
4240                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4241                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4242                     lowner != NULL;
4243                 break;
4244 #endif /* ! defined(sun) */
4245
4246         case DIF_SUBR_BCOPY: {
4247                 /*
4248                  * We need to be sure that the destination is in the scratch
4249                  * region -- no other region is allowed.
4250                  */
4251                 uintptr_t src = tupregs[0].dttk_value;
4252                 uintptr_t dest = tupregs[1].dttk_value;
4253                 size_t size = tupregs[2].dttk_value;
4254
4255                 if (!dtrace_inscratch(dest, size, mstate)) {
4256                         *flags |= CPU_DTRACE_BADADDR;
4257                         *illval = regs[rd];
4258                         break;
4259                 }
4260
4261                 if (!dtrace_canload(src, size, mstate, vstate)) {
4262                         regs[rd] = 0;
4263                         break;
4264                 }
4265
4266                 dtrace_bcopy((void *)src, (void *)dest, size);
4267                 break;
4268         }
4269
4270         case DIF_SUBR_ALLOCA:
4271         case DIF_SUBR_COPYIN: {
4272                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4273                 uint64_t size =
4274                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4275                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4276
4277                 /*
4278                  * This action doesn't require any credential checks since
4279                  * probes will not activate in user contexts to which the
4280                  * enabling user does not have permissions.
4281                  */
4282
4283                 /*
4284                  * Rounding up the user allocation size could have overflowed
4285                  * a large, bogus allocation (like -1ULL) to 0.
4286                  */
4287                 if (scratch_size < size ||
4288                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
4289                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4290                         regs[rd] = 0;
4291                         break;
4292                 }
4293
4294                 if (subr == DIF_SUBR_COPYIN) {
4295                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4296                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4297                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4298                 }
4299
4300                 mstate->dtms_scratch_ptr += scratch_size;
4301                 regs[rd] = dest;
4302                 break;
4303         }
4304
4305         case DIF_SUBR_COPYINTO: {
4306                 uint64_t size = tupregs[1].dttk_value;
4307                 uintptr_t dest = tupregs[2].dttk_value;
4308
4309                 /*
4310                  * This action doesn't require any credential checks since
4311                  * probes will not activate in user contexts to which the
4312                  * enabling user does not have permissions.
4313                  */
4314                 if (!dtrace_inscratch(dest, size, mstate)) {
4315                         *flags |= CPU_DTRACE_BADADDR;
4316                         *illval = regs[rd];
4317                         break;
4318                 }
4319
4320                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4321                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4322                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4323                 break;
4324         }
4325
4326         case DIF_SUBR_COPYINSTR: {
4327                 uintptr_t dest = mstate->dtms_scratch_ptr;
4328                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4329
4330                 if (nargs > 1 && tupregs[1].dttk_value < size)
4331                         size = tupregs[1].dttk_value + 1;
4332
4333                 /*
4334                  * This action doesn't require any credential checks since
4335                  * probes will not activate in user contexts to which the
4336                  * enabling user does not have permissions.
4337                  */
4338                 if (!DTRACE_INSCRATCH(mstate, size)) {
4339                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4340                         regs[rd] = 0;
4341                         break;
4342                 }
4343
4344                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4345                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4346                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4347
4348                 ((char *)dest)[size - 1] = '\0';
4349                 mstate->dtms_scratch_ptr += size;
4350                 regs[rd] = dest;
4351                 break;
4352         }
4353
4354 #if defined(sun)
4355         case DIF_SUBR_MSGSIZE:
4356         case DIF_SUBR_MSGDSIZE: {
4357                 uintptr_t baddr = tupregs[0].dttk_value, daddr;
4358                 uintptr_t wptr, rptr;
4359                 size_t count = 0;
4360                 int cont = 0;
4361
4362                 while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4363
4364                         if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4365                             vstate)) {
4366                                 regs[rd] = 0;
4367                                 break;
4368                         }
4369
4370                         wptr = dtrace_loadptr(baddr +
4371                             offsetof(mblk_t, b_wptr));
4372
4373                         rptr = dtrace_loadptr(baddr +
4374                             offsetof(mblk_t, b_rptr));
4375
4376                         if (wptr < rptr) {
4377                                 *flags |= CPU_DTRACE_BADADDR;
4378                                 *illval = tupregs[0].dttk_value;
4379                                 break;
4380                         }
4381
4382                         daddr = dtrace_loadptr(baddr +
4383                             offsetof(mblk_t, b_datap));
4384
4385                         baddr = dtrace_loadptr(baddr +
4386                             offsetof(mblk_t, b_cont));
4387
4388                         /*
4389                          * We want to protect against denial-of-service here,
4390                          * so we're only going to search the list for
4391                          * dtrace_msgdsize_max mblks.
4392                          */
4393                         if (cont++ > dtrace_msgdsize_max) {
4394                                 *flags |= CPU_DTRACE_ILLOP;
4395                                 break;
4396                         }
4397
4398                         if (subr == DIF_SUBR_MSGDSIZE) {
4399                                 if (dtrace_load8(daddr +
4400                                     offsetof(dblk_t, db_type)) != M_DATA)
4401                                         continue;
4402                         }
4403
4404                         count += wptr - rptr;
4405                 }
4406
4407                 if (!(*flags & CPU_DTRACE_FAULT))
4408                         regs[rd] = count;
4409
4410                 break;
4411         }
4412 #endif
4413
4414         case DIF_SUBR_PROGENYOF: {
4415                 pid_t pid = tupregs[0].dttk_value;
4416                 proc_t *p;
4417                 int rval = 0;
4418
4419                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4420
4421                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4422 #if defined(sun)
4423                         if (p->p_pidp->pid_id == pid) {
4424 #else
4425                         if (p->p_pid == pid) {
4426 #endif
4427                                 rval = 1;
4428                                 break;
4429                         }
4430                 }
4431
4432                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4433
4434                 regs[rd] = rval;
4435                 break;
4436         }
4437
4438         case DIF_SUBR_SPECULATION:
4439                 regs[rd] = dtrace_speculation(state);
4440                 break;
4441
4442         case DIF_SUBR_COPYOUT: {
4443                 uintptr_t kaddr = tupregs[0].dttk_value;
4444                 uintptr_t uaddr = tupregs[1].dttk_value;
4445                 uint64_t size = tupregs[2].dttk_value;
4446
4447                 if (!dtrace_destructive_disallow &&
4448                     dtrace_priv_proc_control(state) &&
4449                     !dtrace_istoxic(kaddr, size)) {
4450                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4451                         dtrace_copyout(kaddr, uaddr, size, flags);
4452                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4453                 }
4454                 break;
4455         }
4456
4457         case DIF_SUBR_COPYOUTSTR: {
4458                 uintptr_t kaddr = tupregs[0].dttk_value;
4459                 uintptr_t uaddr = tupregs[1].dttk_value;
4460                 uint64_t size = tupregs[2].dttk_value;
4461
4462                 if (!dtrace_destructive_disallow &&
4463                     dtrace_priv_proc_control(state) &&
4464                     !dtrace_istoxic(kaddr, size)) {
4465                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4466                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
4467                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4468                 }
4469                 break;
4470         }
4471
4472         case DIF_SUBR_STRLEN: {
4473                 size_t sz;
4474                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4475                 sz = dtrace_strlen((char *)addr,
4476                     state->dts_options[DTRACEOPT_STRSIZE]);
4477
4478                 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4479                         regs[rd] = 0;
4480                         break;
4481                 }
4482
4483                 regs[rd] = sz;
4484
4485                 break;
4486         }
4487
4488         case DIF_SUBR_STRCHR:
4489         case DIF_SUBR_STRRCHR: {
4490                 /*
4491                  * We're going to iterate over the string looking for the
4492                  * specified character.  We will iterate until we have reached
4493                  * the string length or we have found the character.  If this
4494                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4495                  * of the specified character instead of the first.
                      *
                      * The result register receives the address of the match, or
                      * 0 when there is no match or the access check below fails.
4496                  */
4497                 uintptr_t saddr = tupregs[0].dttk_value;
4498                 uintptr_t addr = tupregs[0].dttk_value;
4499                 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4500                 char c, target = (char)tupregs[1].dttk_value;
                     size_t examined;
4501
4502                 for (regs[rd] = 0; addr < limit; addr++) {
4503                         if ((c = dtrace_load8(addr)) == target) {
4504                                 regs[rd] = addr;
4505
4506                                 if (subr == DIF_SUBR_STRCHR)
4507                                         break;
4508                         }
4509
4510                         if (c == '\0')
4511                                 break;
4512                 }
4513
                     /*
                      * If we left the loop early (on a match or on the
                      * terminator), the byte at addr was loaded too and it
                      * determined the result, so it must be part of the range
                      * we access check.  Checking only addr - saddr bytes would
                      * let one byte just beyond the loadable range influence
                      * what we return.
                      */
                     examined = addr - saddr;
                     if (addr < limit)
                             examined++;

4514                 if (!dtrace_canload(saddr, examined, mstate, vstate)) {
4515                         regs[rd] = 0;
4516                         break;
4517                 }
4518
4519                 break;
4520         }
4521
4522         case DIF_SUBR_STRSTR:
4523         case DIF_SUBR_INDEX:
4524         case DIF_SUBR_RINDEX: {
4525                 /*
4526                  * We're going to iterate over the string looking for the
4527                  * specified string.  We will iterate until we have reached
4528                  * the string length or we have found the string.  (Yes, this
4529                  * is done in the most naive way possible -- but considering
4530                  * that the string we're searching for is likely to be
4531                  * relatively short, the complexity of Rabin-Karp or similar
4532                  * hardly seems merited.)
4533                  */
4534                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4535                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4536                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4537                 size_t len = dtrace_strlen(addr, size);
4538                 size_t sublen = dtrace_strlen(substr, size);
4539                 char *limit = addr + len, *orig = addr;
                     /* strstr() reports "not found" as 0; index()/rindex() as -1. */
4540                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
                     /* Scan direction: +1 by default, -1 once rindex() is set up. */
4541                 int inc = 1;
4542
4543                 regs[rd] = notfound;
4544
                     /*
                      * Both strings must be loadable for their measured length
                      * plus one byte.  On failure the result is 0 for all three
                      * subroutines (not the per-subroutine notfound value).
                      */
4545                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4546                         regs[rd] = 0;
4547                         break;
4548                 }
4549
4550                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4551                     vstate)) {
4552                         regs[rd] = 0;
4553                         break;
4554                 }
4555
4556                 /*
4557                  * strstr() and index()/rindex() have similar semantics if
4558                  * both strings are the empty string: strstr() returns a
4559                  * pointer to the (empty) string, and index() and rindex()
4560                  * both return index 0 (regardless of any position argument).
4561                  */
4562                 if (sublen == 0 && len == 0) {
4563                         if (subr == DIF_SUBR_STRSTR)
4564                                 regs[rd] = (uintptr_t)addr;
4565                         else
4566                                 regs[rd] = 0;
4567                         break;
4568                 }
4569
4570                 if (subr != DIF_SUBR_STRSTR) {
4571                         if (subr == DIF_SUBR_RINDEX) {
                                     /*
                                      * rindex() walks backwards: it starts at the
                                      * end of the string and stops once it has
                                      * stepped to the position just before the
                                      * first byte.
                                      */
4572                                 limit = orig - 1;
4573                                 addr += len;
4574                                 inc = -1;
4575                         }
4576
4577                         /*
4578                          * Both index() and rindex() take an optional position
4579                          * argument that denotes the starting position.
4580                          */
4581                         if (nargs == 3) {
4582                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
4583
4584                                 /*
4585                                  * If the position argument to index() is
4586                                  * negative, Perl implicitly clamps it at
4587                                  * zero.  This semantic is a little surprising
4588                                  * given the special meaning of negative
4589                                  * positions to similar Perl functions like
4590                                  * substr(), but it appears to reflect a
4591                                  * notion that index() can start from a
4592                                  * negative index and increment its way up to
4593                                  * the string.  Given this notion, Perl's
4594                                  * rindex() is at least self-consistent in
4595                                  * that it implicitly clamps positions greater
4596                                  * than the string length to be the string
4597                                  * length.  Where Perl completely loses
4598                                  * coherence, however, is when the specified
4599                                  * substring is the empty string ("").  In
4600                                  * this case, even if the position is
4601                                  * negative, rindex() returns 0 -- and even if
4602                                  * the position is greater than the length,
4603                                  * index() returns the string length.  These
4604                                  * semantics violate the notion that index()
4605                                  * should never return a value less than the
4606                                  * specified position and that rindex() should
4607                                  * never return a value greater than the
4608                                  * specified position.  (One assumes that
4609                                  * these semantics are artifacts of Perl's
4610                                  * implementation and not the results of
4611                                  * deliberate design -- it beggars belief that
4612                                  * even Larry Wall could desire such oddness.)
4613                                  * While in the abstract one would wish for
4614                                  * consistent position semantics across
4615                                  * substr(), index() and rindex() -- or at the
4616                                  * very least self-consistent position
4617                                  * semantics for index() and rindex() -- we
4618                                  * instead opt to keep with the extant Perl
4619                                  * semantics, in all their broken glory.  (Do
4620                                  * we have more desire to maintain Perl's
4621                                  * semantics than Perl does?  Probably.)
4622                                  */
4623                                 if (subr == DIF_SUBR_RINDEX) {
4624                                         if (pos < 0) {
4625                                                 if (sublen == 0)
4626                                                         regs[rd] = 0;
4627                                                 break;
4628                                         }
4629
4630                                         if (pos > len)
4631                                                 pos = len;
4632                                 } else {
4633                                         if (pos < 0)
4634                                                 pos = 0;
4635
4636                                         if (pos >= len) {
4637                                                 if (sublen == 0)
4638                                                         regs[rd] = len;
4639                                                 break;
4640                                         }
4641                                 }
4642
                                     /* Begin the scan at the clamped position. */
4643                                 addr = orig + pos;
4644                         }
4645                 }
4646
                     /*
                      * Visit candidate positions in the chosen direction until
                      * limit is reached, comparing up to sublen bytes of substr
                      * at each one; the first position that compares equal wins.
                      */
4647                 for (regs[rd] = notfound; addr != limit; addr += inc) {
4648                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
4649                                 if (subr != DIF_SUBR_STRSTR) {
4650                                         /*
4651                                          * As D index() and rindex() are
4652                                          * modeled on Perl (and not on awk),
4653                                          * we return a zero-based (and not a
4654                                          * one-based) index.  (For you Perl
4655                                          * weenies: no, we're not going to add
4656                                          * $[ -- and shouldn't you be at a con
4657                                          * or something?)
4658                                          */
4659                                         regs[rd] = (uintptr_t)(addr - orig);
4660                                         break;
4661                                 }
4662
4663                                 ASSERT(subr == DIF_SUBR_STRSTR);
4664                                 regs[rd] = (uintptr_t)addr;
4665                                 break;
4666                         }
4667                 }
4668
4669                 break;
4670         }
4671
4672         case DIF_SUBR_STRTOK: {
                     /*
                      * Return, in scratch, the next token of the string at addr
                      * (or of the position saved in the mstate when addr is NULL).
                      * Tokens are separated by any of the characters found in the
                      * string at tokaddr.  The result is 0 when an access check
                      * fails, when scratch is exhausted, or when only separator
                      * characters remain before the end of the string.
                      */
4673                 uintptr_t addr = tupregs[0].dttk_value;
4674                 uintptr_t tokaddr = tupregs[1].dttk_value;
4675                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4676                 uintptr_t limit, toklimit = tokaddr + size;
4677                 uint8_t c = 0, tokmap[32];       /* 256 / 8 */
4678                 char *dest = (char *)mstate->dtms_scratch_ptr;
4679                 int i;
4680
4681                 /*
4682                  * Check both the token buffer and (later) the input buffer,
4683                  * since both could be non-scratch addresses.
4684                  */
4685                 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4686                         regs[rd] = 0;
4687                         break;
4688                 }
4689
                     /* The token is assembled in a strsize-byte piece of scratch. */
4690                 if (!DTRACE_INSCRATCH(mstate, size)) {
4691                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4692                         regs[rd] = 0;
4693                         break;
4694                 }
4695
4696                 if (addr == 0) {
4697                         /*
4698                          * If the address specified is NULL, we use our saved
4699                          * strtok pointer from the mstate.  Note that this
4700                          * means that the saved strtok pointer is _only_
4701                          * valid within multiple enablings of the same probe --
4702                          * it behaves like an implicit clause-local variable.
4703                          */
4704                         addr = mstate->dtms_strtok;
4705                 } else {
4706                         /*
4707                          * If the user-specified address is non-NULL we must
4708                          * access check it.  This is the only time we have
4709                          * a chance to do so, since this address may reside
4710                          * in the string table of this clause-- future calls
4711                          * (when we fetch addr from mstate->dtms_strtok)
4712                          * would fail this access check.
4713                          */
4714                         if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4715                                 regs[rd] = 0;
4716                                 break;
4717                         }
4718                 }
4719
4720                 /*
4721                  * First, zero the token map, and then process the token
4722                  * string -- setting a bit in the map for every character
4723                  * found in the token string.
4724                  */
4725                 for (i = 0; i < sizeof (tokmap); i++)
4726                         tokmap[i] = 0;
4727
4728                 for (; tokaddr < toklimit; tokaddr++) {
4729                         if ((c = dtrace_load8(tokaddr)) == '\0')
4730                                 break;
4731
                             /* One bit per byte value: byte c / 8, bit c % 8. */
4732                         ASSERT((c >> 3) < sizeof (tokmap));
4733                         tokmap[c >> 3] |= (1 << (c & 0x7));
4734                 }
4735
                     /*
                      * Skip leading separators.  Note that c is shared with the
                      * test that follows the loop: it holds the last byte loaded.
                      */
4736                 for (limit = addr + size; addr < limit; addr++) {
4737                         /*
4738                          * We're looking for a character that is _not_ contained
4739                          * in the token string.
4740                          */
4741                         if ((c = dtrace_load8(addr)) == '\0')
4742                                 break;
4743
4744                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4745                                 break;
4746                 }
4747
4748                 if (c == '\0') {
4749                         /*
4750                          * We reached the end of the string without finding
4751                          * any character that was not in the token string.
4752                          * We return NULL in this case, and we set the saved
4753                          * address to NULL as well.
4754                          */
4755                         regs[rd] = 0;
4756                         mstate->dtms_strtok = 0;
4757                         break;
4758                 }
4759
4760                 /*
4761                  * From here on, we're copying into the destination string.
4762                  */
4763                 for (i = 0; addr < limit && i < size - 1; addr++) {
4764                         if ((c = dtrace_load8(addr)) == '\0')
4765                                 break;
4766
4767                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
4768                                 break;
4769
4770                         ASSERT(i < size);
4771                         dest[i++] = c;
4772                 }
4773
                     /*
                      * Terminate the token, consume the scratch it occupies and
                      * remember where the scan stopped so that a later call
                      * passing NULL can continue from there.
                      */
4774                 ASSERT(i < size);
4775                 dest[i] = '\0';
4776                 regs[rd] = (uintptr_t)dest;
4777                 mstate->dtms_scratch_ptr += size;
4778                 mstate->dtms_strtok = addr;
4779                 break;
4780         }
4781
4782         case DIF_SUBR_SUBSTR: {
                     /*
                      * Copy part of the argument string into a strsize-byte piece
                      * of scratch and return its address.  Argument 1 is the
                      * starting index (a negative value counts back from the end
                      * of the string); the optional argument 2 is the number of
                      * bytes wanted (a negative value is taken relative to the end
                      * of the string).  The result is 0 if the access check fails
                      * or scratch is exhausted.
                      */
4783                 uintptr_t s = tupregs[0].dttk_value;
4784                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4785                 char *d = (char *)mstate->dtms_scratch_ptr;
4786                 int64_t index = (int64_t)tupregs[1].dttk_value;
4787                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4788                 size_t len = dtrace_strlen((char *)s, size);
4789                 int64_t i;
4790
4791                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4792                         regs[rd] = 0;
4793                         break;
4794                 }
4795
4796                 if (!DTRACE_INSCRATCH(mstate, size)) {
4797                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4798                         regs[rd] = 0;
4799                         break;
4800                 }
4801
                     /* Without a length argument, take everything that fits. */
4802                 if (nargs <= 2)
4803                         remaining = (int64_t)size;
4804
4805                 if (index < 0) {
4806                         index += len;
4807
4808                         if (index < 0 && index + remaining > 0) {
4809                                 remaining += index;
4810                                 index = 0;
4811                         }
4812                 }
4813
4814                 if (index >= len || index < 0) {
4815                         remaining = 0;
4816                 } else if (remaining < 0) {
4817                         remaining += len - index;
4818                 } else if (index + remaining > size) {
4819                         remaining = size - index;
4820                 }
4821
                     /*
                      * The copy below stores a terminator at d[i], and i can
                      * reach remaining.  With index 0 and a source that has no
                      * NUL within strsize bytes, remaining would be exactly size
                      * here and the terminator would land one byte beyond the
                      * scratch we reserved; always leave room for it.
                      */
                     if (remaining >= (int64_t)size)
                             remaining = (int64_t)size - 1;

4822                 for (i = 0; i < remaining; i++) {
4823                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4824                                 break;
4825                 }
4826
4827                 d[i] = '\0';
4828
4829                 mstate->dtms_scratch_ptr += size;
4830                 regs[rd] = (uintptr_t)d;
4831                 break;
4832         }
4833
4834         case DIF_SUBR_JSON: {
                     /*
                      * Hand the JSON text (argument 0) and an element selector
                      * (argument 1) to dtrace_json() and return whatever it
                      * returns.  NOTE(review): the contract of dtrace_json() is
                      * not visible from this block -- confirm how much of dest it
                      * may write.
                      */
4835                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4836                 uintptr_t json = tupregs[0].dttk_value;
4837                 size_t jsonlen = dtrace_strlen((char *)json, size);
4838                 uintptr_t elem = tupregs[1].dttk_value;
4839                 size_t elemlen = dtrace_strlen((char *)elem, size);
4840
                     /*
                      * Scratch layout: dest is given the first jsonlen + 1 bytes,
                      * and the packed selector list follows immediately after.
                      */
4841                 char *dest = (char *)mstate->dtms_scratch_ptr;
4842                 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4843                 char *ee = elemlist;
4844                 int nelems = 1;
4845                 uintptr_t cur;
4846
4847                 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4848                     !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4849                         regs[rd] = 0;
4850                         break;
4851                 }
4852
4853                 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4854                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4855                         regs[rd] = 0;
4856                         break;
4857                 }
4858
4859                 /*
4860                  * Read the element selector and split it up into a packed list
4861                  * of strings.
4862                  */
4863                 for (cur = elem; cur < elem + elemlen; cur++) {
4864                         char cc = dtrace_load8(cur);
4865
4866                         if (cur == elem && cc == '[') {
4867                                 /*
4868                                  * If the first element selector key is
4869                                  * actually an array index then ignore the
4870                                  * bracket.
4871                                  */
4872                                 continue;
4873                         }
4874
                             /* Closing brackets are dropped from the list. */
4875                         if (cc == ']')
4876                                 continue;
4877
                             /*
                              * A '.' or '[' starts the next component: count
                              * it and store a NUL in place of the separator.
                              */
4878                         if (cc == '.' || cc == '[') {
4879                                 nelems++;
4880                                 cc = '\0';
4881                         }
4882
4883                         *ee++ = cc;
4884                 }
4885                 *ee++ = '\0';
4886
                     /*
                      * Only the dest portion of scratch is consumed, and only
                      * when dtrace_json() returns non-zero; the selector list is
                      * not kept.
                      */
4887                 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4888                     nelems, dest)) != 0)
4889                         mstate->dtms_scratch_ptr += jsonlen + 1;
4890                 break;
4891         }
4892
4893         case DIF_SUBR_TOUPPER:
4894         case DIF_SUBR_TOLOWER: {
4895                 /*
4896                  * Copy the argument string into scratch, replacing every
4897                  * letter in 'a'..'z' (toupper) or 'A'..'Z' (tolower) with its
4898                  * counterpart in the other case.  Returns 0 on failure.
4899                  */
4900                 uintptr_t src = tupregs[0].dttk_value;
4901                 uint64_t strsz = state->dts_options[DTRACEOPT_STRSIZE];
4902                 char *out = (char *)mstate->dtms_scratch_ptr;
4903                 size_t srclen = dtrace_strlen((char *)src, strsz);
4904                 int upcase = (subr == DIF_SUBR_TOUPPER);
4905                 char first = upcase ? 'a' : 'A';
4906                 char last = upcase ? 'z' : 'Z';
4907                 char mapped = upcase ? 'A' : 'a';
4908                 int64_t n = 0;
4909
4910                 if (!dtrace_canload(src, srclen + 1, mstate, vstate)) {
4911                         regs[rd] = 0;
4912                         break;
4913                 }
4914
4915                 if (!DTRACE_INSCRATCH(mstate, strsz)) {
4916                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4917                         regs[rd] = 0;
4918                         break;
4919                 }
4920
4921                 /* Stop at the terminator or one byte short of the buffer. */
4922                 while (n < strsz - 1) {
4923                         char ch = dtrace_load8(src + n);
4924
4925                         if (ch == '\0')
4926                                 break;
4927
4928                         if (ch >= first && ch <= last)
4929                                 ch = mapped + (ch - first);
4930
4931                         out[n++] = ch;
4932                 }
4933
4934                 ASSERT(n < strsz);
4935                 out[n] = '\0';
4936                 regs[rd] = (uintptr_t)out;
4937                 mstate->dtms_scratch_ptr += strsz;
4938                 break;
         }
4939
4940 #if defined(sun)
4941         case DIF_SUBR_GETMAJOR:
                     /*
                      * Shift the argument right past its NBITSMINOR* low bits and
                      * mask the result with MAXMAJ*; the 64-bit variants of both
                      * constants are used when _LP64 is defined.
                      */
4942 #ifndef _LP64
4943                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4944 #else
4945                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4946 #endif
4947                 break;
4948
4949         case DIF_SUBR_GETMINOR:
                     /*
                      * Mask the argument with MAXMIN*, keeping only its low bits;
                      * the 64-bit mask is used when _LP64 is defined.
                      */
4950 #ifndef _LP64
4951                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4952 #else
4953                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4954 #endif
4955                 break;
4956
4957         case DIF_SUBR_DDI_PATHNAME: {
4958                 /*
4959                  * This one is a galactic mess.  We are going to roughly
4960                  * emulate ddi_pathname(), but it's made more complicated
4961                  * by the fact that we (a) want to include the minor name and
4962                  * (b) must proceed iteratively instead of recursively.
4963                  */
4964                 uintptr_t dest = mstate->dtms_scratch_ptr;
4965                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4966                 char *start = (char *)dest, *end = start + size - 1;
4967                 uintptr_t daddr = tupregs[0].dttk_value;
4968                 int64_t minor = (int64_t)tupregs[1].dttk_value;
4969                 char *s;
4970                 int i, len, depth = 0;
4971
4972                 /*
4973                  * Due to all the pointer jumping we do and context we must
4974                  * rely upon, we just mandate that the user must have kernel
4975                  * read privileges to use this routine.
4976                  */
4977                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4978                         *flags |= CPU_DTRACE_KPRIV;
4979                         *illval = daddr;
4980                         regs[rd] = 0;
4981                 }
4982
4983                 if (!DTRACE_INSCRATCH(mstate, size)) {
4984                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4985                         regs[rd] = 0;
4986                         break;
4987                 }
4988
4989                 *end = '\0';
4990
4991                 /*
4992                  * We want to have a name for the minor.  In order to do this,
4993                  * we need to walk the minor list from the devinfo.  We want
4994                  * to be sure that we don't infinitely walk a circular list,
4995                  * so we check for circularity by sending a scout pointer
4996                  * ahead two elements for every element that we iterate over;
4997                  * if the list is circular, these will ultimately point to the
4998                  * same element.  You may recognize this little trick as the
4999                  * answer to a stupid interview question -- one that always
5000                  * seems to be asked by those who had to have it laboriously
5001                  * explained to them, and who can't even concisely describe
5002                  * the conditions under which one would be forced to resort to
5003                  * this technique.  Needless to say, those conditions are
5004                  * found here -- and probably only here.  Is this the only use
5005                  * of this infamous trick in shipping, production code?  If it
5006                  * isn't, it probably should be...
5007                  */
5008                 if (minor != -1) {
5009                         uintptr_t maddr = dtrace_loadptr(daddr +
5010                             offsetof(struct dev_info, devi_minor));
5011
5012                         uintptr_t next = offsetof(struct ddi_minor_data, next);
5013                         uintptr_t name = offsetof(struct ddi_minor_data,
5014                             d_minor) + offsetof(struct ddi_minor, name);
5015                         uintptr_t dev = offsetof(struct ddi_minor_data,
5016                             d_minor) + offsetof(struct ddi_minor, dev);
5017                         uintptr_t scout;
5018
5019                         if (maddr != NULL)
5020                                 scout = dtrace_loadptr(maddr + next);
5021
5022                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5023                                 uint64_t m;
5024 #ifdef _LP64
5025                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
5026 #else
5027                                 m = dtrace_load32(maddr + dev) & MAXMIN;
5028 #endif
5029                                 if (m != minor) {
5030                                         maddr = dtrace_loadptr(maddr + next);
5031
5032                                         if (scout == NULL)
5033                                                 continue;
5034
5035                                         scout = dtrace_loadptr(scout + next);
5036
5037                                         if (scout == NULL)
5038                                                 continue;
5039
5040                                         scout = dtrace_loadptr(scout + next);
5041
5042                                         if (scout == NULL)
5043                                                 continue;
5044
5045                                         if (scout == maddr) {
5046                                                 *flags |= CPU_DTRACE_ILLOP;
5047                                                 break;
5048                                         }
5049
5050                                         continue;
5051                                 }
5052
5053                                 /*
5054                                  * We have the minor data.  Now we need to
5055                                  * copy the minor's name into the end of the
5056                                  * pathname.
5057                                  */
5058                                 s = (char *)dtrace_loadptr(maddr + name);
5059                                 len = dtrace_strlen(s, size);
5060
5061                                 if (*flags & CPU_DTRACE_FAULT)
5062                                         break;
5063
5064                                 if (len != 0) {
5065                                         if ((end -= (len + 1)) < start)
5066                                                 break;
5067
5068                                         *end = ':';
5069                                 }
5070
5071                                 for (i = 1; i <= len; i++)
5072                                         end[i] = dtrace_load8((uintptr_t)s++);
5073                                 break;
5074                         }
5075                 }
5076
5077                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5078                         ddi_node_state_t devi_state;
5079
5080                         devi_state = dtrace_load32(daddr +
5081                             offsetof(struct dev_info, devi_node_state));
5082
5083                         if (*flags & CPU_DTRACE_FAULT)
5084                                 break;
5085
5086                         if (devi_state >= DS_INITIALIZED) {
5087                                 s = (char *)dtrace_loadptr(daddr +
5088                                     offsetof(struct dev_info, devi_addr));
5089                                 len = dtrace_strlen(s, size);
5090
5091                                 if (*flags & CPU_DTRACE_FAULT)
5092                                         break;
5093
5094                                 if (len != 0) {
5095                                         if ((end -= (len + 1)) < start)
5096                                                 break;
5097
5098                                         *end = '@';
5099                                 }
5100
5101                                 for (i = 1; i <= len; i++)
5102                                         end[i] = dtrace_load8((uintptr_t)s++);
5103                         }
5104
5105                         /*
5106                          * Now for the node name...
5107                          */
5108                         s = (char *)dtrace_loadptr(daddr +
5109                             offsetof(struct dev_info, devi_node_name));
5110
5111                         daddr = dtrace_loadptr(daddr +
5112                             offsetof(struct dev_info, devi_parent));
5113
5114                         /*
5115                          * If our parent is NULL (that is, if we're the root
5116                          * node), we're going to use the special path
5117                          * "devices".
5118                          */
5119                         if (daddr == 0)
5120                                 s = "devices";
5121
5122                         len = dtrace_strlen(s, size);
5123                         if (*flags & CPU_DTRACE_FAULT)
5124                                 break;
5125
5126                         if ((end -= (len + 1)) < start)
5127                                 break;
5128
5129                         for (i = 1; i <= len; i++)
5130                                 end[i] = dtrace_load8((uintptr_t)s++);
5131                         *end = '/';
5132
5133                         if (depth++ > dtrace_devdepth_max) {
5134                                 *flags |= CPU_DTRACE_ILLOP;
5135                                 break;
5136                         }
5137                 }
5138
5139                 if (end < start)
5140                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5141
5142                 if (daddr == 0) {
5143                         regs[rd] = (uintptr_t)end;
5144                         mstate->dtms_scratch_ptr += size;
5145                 }
5146
5147                 break;
5148         }
5149 #endif
5150
5151         case DIF_SUBR_STRJOIN: {
                     /*
                      * Concatenate the strings in arguments 0 and 1 into scratch
                      * and return the address of the result.  The result is 0 if
                      * either string fails its access check, if scratch is
                      * exhausted, or if the joined string plus its terminator
                      * does not fit in strsize bytes (CPU_DTRACE_NOSCRATCH is
                      * raised in the latter two cases).
                      */
5152                 char *d = (char *)mstate->dtms_scratch_ptr;
5153                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5154                 uintptr_t s1 = tupregs[0].dttk_value;
5155                 uintptr_t s2 = tupregs[1].dttk_value;
5156                 int i = 0;
                     int terminated = 0;     /* set once the final NUL is stored */
5157
5158                 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5159                     !dtrace_strcanload(s2, size, mstate, vstate)) {
5160                         regs[rd] = 0;
5161                         break;
5162                 }
5163
5164                 if (!DTRACE_INSCRATCH(mstate, size)) {
5165                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5166                         regs[rd] = 0;
5167                         break;
5168                 }
5169
                     /*
                      * Copy the first string.  Its terminator is copied too, but
                      * i is then backed up so that the second string overwrites
                      * it.
                      */
5170                 for (;;) {
5171                         if (i >= size) {
5172                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5173                                 regs[rd] = 0;
5174                                 break;
5175                         }
5176
5177                         if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5178                                 i--;
5179                                 break;
5180                         }
5181                 }
5182
                     /*
                      * Append the second string, terminator included.  If the
                      * first copy ran out of room, i is already at size and this
                      * loop gives up immediately without loading anything.
                      */
5183                 for (;;) {
5184                         if (i >= size) {
5185                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5186                                 regs[rd] = 0;
5187                                 break;
5188                         }
5189
5190                         if ((d[i++] = dtrace_load8(s2++)) == '\0') {
                                     terminated = 1;
5191                                 break;
                             }
5192                 }
5193
                     /*
                      * Success is "the terminator was stored", not "i < size": a
                      * result that fills the buffer exactly leaves i == size, and
                      * testing i alone would drop that valid string while leaving
                      * the result register unassigned.
                      */
5194                 if (terminated) {
5195                         mstate->dtms_scratch_ptr += i;
5196                         regs[rd] = (uintptr_t)d;
5197                 }
5198
5199                 break;
5200         }
5201
5202         case DIF_SUBR_STRTOLL: {
5203                 uintptr_t s = tupregs[0].dttk_value;
5204                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5205                 int base = 10;
5206
5207                 if (nargs > 1) {
5208                         if ((base = tupregs[1].dttk_value) <= 1 ||
5209                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5210                                 *flags |= CPU_DTRACE_ILLOP;
5211                                 break;
5212                         }
5213                 }
5214
5215                 if (!dtrace_strcanload(s, size, mstate, vstate)) {
5216                         regs[rd] = INT64_MIN;
5217                         break;
5218                 }
5219
5220                 regs[rd] = dtrace_strtoll((char *)s, base, size);
5221                 break;
5222         }
5223
5224         case DIF_SUBR_LLTOSTR: {
5225                 int64_t i = (int64_t)tupregs[0].dttk_value;
5226                 uint64_t val, digit;
5227                 uint64_t size = 65;     /* enough room for 2^64 in binary */
5228                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5229                 int base = 10;
5230
5231                 if (nargs > 1) {
5232                         if ((base = tupregs[1].dttk_value) <= 1 ||
5233                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5234                                 *flags |= CPU_DTRACE_ILLOP;
5235                                 break;
5236                         }
5237                 }
5238
5239                 val = (base == 10 && i < 0) ? i * -1 : i;
5240
5241                 if (!DTRACE_INSCRATCH(mstate, size)) {
5242                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5243                         regs[rd] = 0;
5244                         break;
5245                 }
5246
5247                 for (*end-- = '\0'; val; val /= base) {
5248                         if ((digit = val % base) <= '9' - '0') {
5249                                 *end-- = '0' + digit;
5250                         } else {
5251                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
5252                         }
5253                 }
5254
5255                 if (i == 0 && base == 16)
5256                         *end-- = '0';
5257
5258                 if (base == 16)
5259                         *end-- = 'x';
5260
5261                 if (i == 0 || base == 8 || base == 16)
5262                         *end-- = '0';
5263
5264                 if (i < 0 && base == 10)
5265                         *end-- = '-';
5266
5267                 regs[rd] = (uintptr_t)end + 1;
5268                 mstate->dtms_scratch_ptr += size;
5269                 break;
5270         }
5271
5272         case DIF_SUBR_HTONS:
5273         case DIF_SUBR_NTOHS:
5274 #if BYTE_ORDER == BIG_ENDIAN
5275                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5276 #else
5277                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5278 #endif
5279                 break;
5280
5281
5282         case DIF_SUBR_HTONL:
5283         case DIF_SUBR_NTOHL:
5284 #if BYTE_ORDER == BIG_ENDIAN
5285                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5286 #else
5287                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5288 #endif
5289                 break;
5290
5291
5292         case DIF_SUBR_HTONLL:
5293         case DIF_SUBR_NTOHLL:
5294 #if BYTE_ORDER == BIG_ENDIAN
5295                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5296 #else
5297                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5298 #endif
5299                 break;
5300
5301
5302         case DIF_SUBR_DIRNAME:
5303         case DIF_SUBR_BASENAME: {
5304                 char *dest = (char *)mstate->dtms_scratch_ptr;
5305                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5306                 uintptr_t src = tupregs[0].dttk_value;
5307                 int i, j, len = dtrace_strlen((char *)src, size);
5308                 int lastbase = -1, firstbase = -1, lastdir = -1;
5309                 int start, end;
5310
5311                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5312                         regs[rd] = 0;
5313                         break;
5314                 }
5315
5316                 if (!DTRACE_INSCRATCH(mstate, size)) {
5317                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5318                         regs[rd] = 0;
5319                         break;
5320                 }
5321
5322                 /*
5323                  * The basename and dirname of a zero-length string are
5324                  * defined to be "."
5325                  */
5326                 if (len == 0) {
5327                         len = 1;
5328                         src = (uintptr_t)".";
5329                 }
5330
5331                 /*
5332                  * Start from the back of the string, moving back toward the
5333                  * front until we see a character that isn't a slash.  That
5334                  * character is the last character in the basename.
5335                  */
5336                 for (i = len - 1; i >= 0; i--) {
5337                         if (dtrace_load8(src + i) != '/')
5338                                 break;
5339                 }
5340
5341                 if (i >= 0)
5342                         lastbase = i;
5343
5344                 /*
5345                  * Starting from the last character in the basename, move
5346                  * towards the front until we find a slash.  The character
5347                  * that we processed immediately before that is the first
5348                  * character in the basename.
5349                  */
5350                 for (; i >= 0; i--) {
5351                         if (dtrace_load8(src + i) == '/')
5352                                 break;
5353                 }
5354
5355                 if (i >= 0)
5356                         firstbase = i + 1;
5357
5358                 /*
5359                  * Now keep going until we find a non-slash character.  That
5360                  * character is the last character in the dirname.
5361                  */
5362                 for (; i >= 0; i--) {
5363                         if (dtrace_load8(src + i) != '/')
5364                                 break;
5365                 }
5366
5367                 if (i >= 0)
5368                         lastdir = i;
5369
5370                 ASSERT(!(lastbase == -1 && firstbase != -1));
5371                 ASSERT(!(firstbase == -1 && lastdir != -1));
5372
5373                 if (lastbase == -1) {
5374                         /*
5375                          * We didn't find a non-slash character.  We know that
5376                          * the length is non-zero, so the whole string must be
5377                          * slashes.  In either the dirname or the basename
5378                          * case, we return '/'.
5379                          */
5380                         ASSERT(firstbase == -1);
5381                         firstbase = lastbase = lastdir = 0;
5382                 }
5383
5384                 if (firstbase == -1) {
5385                         /*
5386                          * The entire string consists only of a basename
5387                          * component.  If we're looking for dirname, we need
5388                          * to change our string to be just "."; if we're
5389                          * looking for a basename, we'll just set the index of
5390                          * the first character of the basename to be 0.
5391                          */
5392                         if (subr == DIF_SUBR_DIRNAME) {
5393                                 ASSERT(lastdir == -1);
5394                                 src = (uintptr_t)".";
5395                                 lastdir = 0;
5396                         } else {
5397                                 firstbase = 0;
5398                         }
5399                 }
5400
5401                 if (subr == DIF_SUBR_DIRNAME) {
5402                         if (lastdir == -1) {
5403                                 /*
5404                                  * We know that we have a slash in the name --
5405                                  * or lastdir would be set to 0, above.  And
5406                                  * because lastdir is -1, we know that this
5407                                  * slash must be the first character.  (That
5408                                  * is, the full string must be of the form
5409                                  * "/basename".)  In this case, the last
5410                                  * character of the directory name is index 0.
5411                                  */
5412                                 lastdir = 0;
5413                         }
5414
5415                         start = 0;
5416                         end = lastdir;
5417                 } else {
5418                         ASSERT(subr == DIF_SUBR_BASENAME);
5419                         ASSERT(firstbase != -1 && lastbase != -1);
5420                         start = firstbase;
5421                         end = lastbase;
5422                 }
5423
5424                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5425                         dest[j] = dtrace_load8(src + i);
5426
5427                 dest[j] = '\0';
5428                 regs[rd] = (uintptr_t)dest;
5429                 mstate->dtms_scratch_ptr += size;
5430                 break;
5431         }
5432
5433         case DIF_SUBR_GETF: {
5434                 uintptr_t fd = tupregs[0].dttk_value;
5435                 struct filedesc *fdp;
5436                 file_t *fp;
5437
5438                 if (!dtrace_priv_proc(state)) {
5439                         regs[rd] = 0;
5440                         break;
5441                 }
5442                 fdp = curproc->p_fd;
5443                 FILEDESC_SLOCK(fdp);
5444                 fp = fget_locked(fdp, fd);
5445                 mstate->dtms_getf = fp;
5446                 regs[rd] = (uintptr_t)fp;
5447                 FILEDESC_SUNLOCK(fdp);
5448                 break;
5449         }
5450
5451         case DIF_SUBR_CLEANPATH: {
5452                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5453                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5454                 uintptr_t src = tupregs[0].dttk_value;
5455                 int i = 0, j = 0;
5456 #if defined(sun)
5457                 zone_t *z;
5458 #endif
5459
5460                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
5461                         regs[rd] = 0;
5462                         break;
5463                 }
5464
5465                 if (!DTRACE_INSCRATCH(mstate, size)) {
5466                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5467                         regs[rd] = 0;
5468                         break;
5469                 }
5470
5471                 /*
5472                  * Move forward, loading each character.
5473                  */
5474                 do {
5475                         c = dtrace_load8(src + i++);
5476 next:
5477                         if (j + 5 >= size)      /* 5 = strlen("/..c\0") */
5478                                 break;
5479
5480                         if (c != '/') {
5481                                 dest[j++] = c;
5482                                 continue;
5483                         }
5484
5485                         c = dtrace_load8(src + i++);
5486
5487                         if (c == '/') {
5488                                 /*
5489                                  * We have two slashes -- we can just advance
5490                                  * to the next character.
5491                                  */
5492                                 goto next;
5493                         }
5494
5495                         if (c != '.') {
5496                                 /*
5497                                  * This is not "." and it's not ".." -- we can
5498                                  * just store the "/" and this character and
5499                                  * drive on.
5500                                  */
5501                                 dest[j++] = '/';
5502                                 dest[j++] = c;
5503                                 continue;
5504                         }
5505
5506                         c = dtrace_load8(src + i++);
5507
5508                         if (c == '/') {
5509                                 /*
5510                                  * This is a "/./" component.  We're not going
5511                                  * to store anything in the destination buffer;
5512                                  * we're just going to go to the next component.
5513                                  */
5514                                 goto next;
5515                         }
5516
5517                         if (c != '.') {
5518                                 /*
5519                                  * This is not ".." -- we can just store the
5520                                  * "/." and this character and continue
5521                                  * processing.
5522                                  */
5523                                 dest[j++] = '/';
5524                                 dest[j++] = '.';
5525                                 dest[j++] = c;
5526                                 continue;
5527                         }
5528
5529                         c = dtrace_load8(src + i++);
5530
5531                         if (c != '/' && c != '\0') {
5532                                 /*
5533                                  * This is not ".." -- it's "..[mumble]".
5534                                  * We'll store the "/.." and this character
5535                                  * and continue processing.
5536                                  */
5537                                 dest[j++] = '/';
5538                                 dest[j++] = '.';
5539                                 dest[j++] = '.';
5540                                 dest[j++] = c;
5541                                 continue;
5542                         }
5543
5544                         /*
5545                          * This is "/../" or "/..\0".  We need to back up
5546                          * our destination pointer until we find a "/".
5547                          */
5548                         i--;
5549                         while (j != 0 && dest[--j] != '/')
5550                                 continue;
5551
5552                         if (c == '\0')
5553                                 dest[++j] = '/';
5554                 } while (c != '\0');
5555
5556                 dest[j] = '\0';
5557
5558 #if defined(sun)
5559                 if (mstate->dtms_getf != NULL &&
5560                     !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5561                     (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5562                         /*
5563                          * If we've done a getf() as a part of this ECB and we
5564                          * don't have kernel access (and we're not in the global
5565                          * zone), check if the path we cleaned up begins with
5566                          * the zone's root path, and trim it off if so.  Note
5567                          * that this is an output cleanliness issue, not a
5568                          * security issue: knowing one's zone root path does
5569                          * not enable privilege escalation.
5570                          */
5571                         if (strstr(dest, z->zone_rootpath) == dest)
5572                                 dest += strlen(z->zone_rootpath) - 1;
5573                 }
5574 #endif
5575
5576                 regs[rd] = (uintptr_t)dest;
5577                 mstate->dtms_scratch_ptr += size;
5578                 break;
5579         }
5580
5581         case DIF_SUBR_INET_NTOA:
5582         case DIF_SUBR_INET_NTOA6:
5583         case DIF_SUBR_INET_NTOP: {
5584                 size_t size;
5585                 int af, argi, i;
5586                 char *base, *end;
5587
5588                 if (subr == DIF_SUBR_INET_NTOP) {
5589                         af = (int)tupregs[0].dttk_value;
5590                         argi = 1;
5591                 } else {
5592                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5593                         argi = 0;
5594                 }
5595
5596                 if (af == AF_INET) {
5597                         ipaddr_t ip4;
5598                         uint8_t *ptr8, val;
5599
5600                         /*
5601                          * Safely load the IPv4 address.
5602                          */
5603                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
5604
5605                         /*
5606                          * Check an IPv4 string will fit in scratch.
5607                          */
5608                         size = INET_ADDRSTRLEN;
5609                         if (!DTRACE_INSCRATCH(mstate, size)) {
5610                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5611                                 regs[rd] = 0;
5612                                 break;
5613                         }
5614                         base = (char *)mstate->dtms_scratch_ptr;
5615                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5616
5617                         /*
5618                          * Stringify as a dotted decimal quad.
5619                          */
5620                         *end-- = '\0';
5621                         ptr8 = (uint8_t *)&ip4;
5622                         for (i = 3; i >= 0; i--) {
5623                                 val = ptr8[i];
5624
5625                                 if (val == 0) {
5626                                         *end-- = '0';
5627                                 } else {
5628                                         for (; val; val /= 10) {
5629                                                 *end-- = '0' + (val % 10);
5630                                         }
5631                                 }
5632
5633                                 if (i > 0)
5634                                         *end-- = '.';
5635                         }
5636                         ASSERT(end + 1 >= base);
5637
5638                 } else if (af == AF_INET6) {
5639                         struct in6_addr ip6;
5640                         int firstzero, tryzero, numzero, v6end;
5641                         uint16_t val;
5642                         const char digits[] = "0123456789abcdef";
5643
5644                         /*
5645                          * Stringify using RFC 1884 convention 2 - 16 bit
5646                          * hexadecimal values with a zero-run compression.
5647                          * Lower case hexadecimal digits are used.
5648                          *      eg, fe80::214:4fff:fe0b:76c8.
5649                          * The IPv4 embedded form is returned for inet_ntop,
5650                          * just the IPv4 string is returned for inet_ntoa6.
5651                          */
5652
5653                         /*
5654                          * Safely load the IPv6 address.
5655                          */
5656                         dtrace_bcopy(
5657                             (void *)(uintptr_t)tupregs[argi].dttk_value,
5658                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5659
5660                         /*
5661                          * Check an IPv6 string will fit in scratch.
5662                          */
5663                         size = INET6_ADDRSTRLEN;
5664                         if (!DTRACE_INSCRATCH(mstate, size)) {
5665                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5666                                 regs[rd] = 0;
5667                                 break;
5668                         }
5669                         base = (char *)mstate->dtms_scratch_ptr;
5670                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5671                         *end-- = '\0';
5672
5673                         /*
5674                          * Find the longest run of 16 bit zero values
5675                          * for the single allowed zero compression - "::".
5676                          */
5677                         firstzero = -1;
5678                         tryzero = -1;
5679                         numzero = 1;
5680                         for (i = 0; i < sizeof (struct in6_addr); i++) {
5681 #if defined(sun)
5682                                 if (ip6._S6_un._S6_u8[i] == 0 &&
5683 #else
5684                                 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5685 #endif
5686                                     tryzero == -1 && i % 2 == 0) {
5687                                         tryzero = i;
5688                                         continue;
5689                                 }
5690
5691                                 if (tryzero != -1 &&
5692 #if defined(sun)
5693                                     (ip6._S6_un._S6_u8[i] != 0 ||
5694 #else
5695                                     (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5696 #endif
5697                                     i == sizeof (struct in6_addr) - 1)) {
5698
5699                                         if (i - tryzero <= numzero) {
5700                                                 tryzero = -1;
5701                                                 continue;
5702                                         }
5703
5704                                         firstzero = tryzero;
5705                                         numzero = i - i % 2 - tryzero;
5706                                         tryzero = -1;
5707
5708 #if defined(sun)
5709                                         if (ip6._S6_un._S6_u8[i] == 0 &&
5710 #else
5711                                         if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5712 #endif
5713                                             i == sizeof (struct in6_addr) - 1)
5714                                                 numzero += 2;
5715                                 }
5716                         }
5717                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5718
5719                         /*
5720                          * Check for an IPv4 embedded address.
5721                          */
5722                         v6end = sizeof (struct in6_addr) - 2;
5723                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5724                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
5725                                 for (i = sizeof (struct in6_addr) - 1;
5726                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
5727                                         ASSERT(end >= base);
5728
5729 #if defined(sun)
5730                                         val = ip6._S6_un._S6_u8[i];
5731 #else
5732                                         val = ip6.__u6_addr.__u6_addr8[i];
5733 #endif
5734
5735                                         if (val == 0) {
5736                                                 *end-- = '0';
5737                                         } else {
5738                                                 for (; val; val /= 10) {
5739                                                         *end-- = '0' + val % 10;
5740                                                 }
5741                                         }
5742
5743                                         if (i > DTRACE_V4MAPPED_OFFSET)
5744                                                 *end-- = '.';
5745                                 }
5746
5747                                 if (subr == DIF_SUBR_INET_NTOA6)
5748                                         goto inetout;
5749
5750                                 /*
5751                                  * Set v6end to skip the IPv4 address that
5752                                  * we have already stringified.
5753                                  */
5754                                 v6end = 10;
5755                         }
5756
5757                         /*
5758                          * Build the IPv6 string by working through the
5759                          * address in reverse.
5760                          */
5761                         for (i = v6end; i >= 0; i -= 2) {
5762                                 ASSERT(end >= base);
5763
5764                                 if (i == firstzero + numzero - 2) {
5765                                         *end-- = ':';
5766                                         *end-- = ':';
5767                                         i -= numzero - 2;
5768                                         continue;
5769                                 }
5770
5771                                 if (i < 14 && i != firstzero - 2)
5772                                         *end-- = ':';
5773
5774 #if defined(sun)
5775                                 val = (ip6._S6_un._S6_u8[i] << 8) +
5776                                     ip6._S6_un._S6_u8[i + 1];
5777 #else
5778                                 val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5779                                     ip6.__u6_addr.__u6_addr8[i + 1];
5780 #endif
5781
5782                                 if (val == 0) {
5783                                         *end-- = '0';
5784                                 } else {
5785                                         for (; val; val /= 16) {
5786                                                 *end-- = digits[val % 16];
5787                                         }
5788                                 }
5789                         }
5790                         ASSERT(end + 1 >= base);
5791
5792                 } else {
5793                         /*
5794                          * The user didn't use AF_INET or AF_INET6.
5795                          */
5796                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5797                         regs[rd] = 0;
5798                         break;
5799                 }
5800
5801 inetout:        regs[rd] = (uintptr_t)end + 1;
5802                 mstate->dtms_scratch_ptr += size;
5803                 break;
5804         }
5805
5806         case DIF_SUBR_MEMREF: {
5807                 uintptr_t size = 2 * sizeof(uintptr_t);
5808                 uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5809                 size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5810
5811                 /* address and length */
5812                 memref[0] = tupregs[0].dttk_value;
5813                 memref[1] = tupregs[1].dttk_value;
5814
5815                 regs[rd] = (uintptr_t) memref;
5816                 mstate->dtms_scratch_ptr += scratch_size;
5817                 break;
5818         }
5819
5820 #if !defined(sun)
5821         case DIF_SUBR_MEMSTR: {
5822                 char *str = (char *)mstate->dtms_scratch_ptr;
5823                 uintptr_t mem = tupregs[0].dttk_value;
5824                 char c = tupregs[1].dttk_value;
5825                 size_t size = tupregs[2].dttk_value;
5826                 uint8_t n;
5827                 int i;
5828
5829                 regs[rd] = 0;
5830
5831                 if (size == 0)
5832                         break;
5833
5834                 if (!dtrace_canload(mem, size - 1, mstate, vstate))
5835                         break;
5836
5837                 if (!DTRACE_INSCRATCH(mstate, size)) {
5838                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5839                         break;
5840                 }
5841
5842                 if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
5843                         *flags |= CPU_DTRACE_ILLOP;
5844                         break;
5845                 }
5846
5847                 for (i = 0; i < size - 1; i++) {
5848                         n = dtrace_load8(mem++);
5849                         str[i] = (n == 0) ? c : n;
5850                 }
5851                 str[size - 1] = 0;
5852
5853                 regs[rd] = (uintptr_t)str;
5854                 mstate->dtms_scratch_ptr += size;
5855                 break;
5856         }
5857 #endif
5858
5859         case DIF_SUBR_TYPEREF: {
5860                 uintptr_t size = 4 * sizeof(uintptr_t);
5861                 uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5862                 size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5863
5864                 /* address, num_elements, type_str, type_len */
5865                 typeref[0] = tupregs[0].dttk_value;
5866                 typeref[1] = tupregs[1].dttk_value;
5867                 typeref[2] = tupregs[2].dttk_value;
5868                 typeref[3] = tupregs[3].dttk_value;
5869
5870                 regs[rd] = (uintptr_t) typeref;
5871                 mstate->dtms_scratch_ptr += scratch_size;
5872                 break;
5873         }
5874         }
5875 }
5876
5877 /*
5878  * Emulate the execution of DTrace IR instructions specified by the given
5879  * DIF object.  This function is deliberately void of assertions as all of
5880  * the necessary checks are handled by a call to dtrace_difo_validate().
5881  */
5882 static uint64_t
5883 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5884     dtrace_vstate_t *vstate, dtrace_state_t *state)
5885 {
5886         const dif_instr_t *text = difo->dtdo_buf;
5887         const uint_t textlen = difo->dtdo_len;
5888         const char *strtab = difo->dtdo_strtab;
5889         const uint64_t *inttab = difo->dtdo_inttab;
5890
5891         uint64_t rval = 0;
5892         dtrace_statvar_t *svar;
5893         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5894         dtrace_difv_t *v;
5895         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5896         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5897
5898         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5899         uint64_t regs[DIF_DIR_NREGS];
5900         uint64_t *tmp;
5901
5902         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5903         int64_t cc_r;
5904         uint_t pc = 0, id, opc = 0;
5905         uint8_t ttop = 0;
5906         dif_instr_t instr;
5907         uint_t r1, r2, rd;
5908
5909         /*
5910          * We stash the current DIF object into the machine state: we need it
5911          * for subsequent access checking.
5912          */
5913         mstate->dtms_difo = difo;
5914
5915         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
5916
5917         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5918                 opc = pc;
5919
5920                 instr = text[pc++];
5921                 r1 = DIF_INSTR_R1(instr);
5922                 r2 = DIF_INSTR_R2(instr);
5923                 rd = DIF_INSTR_RD(instr);
5924
5925                 switch (DIF_INSTR_OP(instr)) {
5926                 case DIF_OP_OR:
5927                         regs[rd] = regs[r1] | regs[r2];
5928                         break;
5929                 case DIF_OP_XOR:
5930                         regs[rd] = regs[r1] ^ regs[r2];
5931                         break;
5932                 case DIF_OP_AND:
5933                         regs[rd] = regs[r1] & regs[r2];
5934                         break;
5935                 case DIF_OP_SLL:
5936                         regs[rd] = regs[r1] << regs[r2];
5937                         break;
5938                 case DIF_OP_SRL:
5939                         regs[rd] = regs[r1] >> regs[r2];
5940                         break;
5941                 case DIF_OP_SUB:
5942                         regs[rd] = regs[r1] - regs[r2];
5943                         break;
5944                 case DIF_OP_ADD:
5945                         regs[rd] = regs[r1] + regs[r2];
5946                         break;
5947                 case DIF_OP_MUL:
5948                         regs[rd] = regs[r1] * regs[r2];
5949                         break;
5950                 case DIF_OP_SDIV:
5951                         if (regs[r2] == 0) {
5952                                 regs[rd] = 0;
5953                                 *flags |= CPU_DTRACE_DIVZERO;
5954                         } else {
5955                                 regs[rd] = (int64_t)regs[r1] /
5956                                     (int64_t)regs[r2];
5957                         }
5958                         break;
5959
5960                 case DIF_OP_UDIV:
5961                         if (regs[r2] == 0) {
5962                                 regs[rd] = 0;
5963                                 *flags |= CPU_DTRACE_DIVZERO;
5964                         } else {
5965                                 regs[rd] = regs[r1] / regs[r2];
5966                         }
5967                         break;
5968
5969                 case DIF_OP_SREM:
5970                         if (regs[r2] == 0) {
5971                                 regs[rd] = 0;
5972                                 *flags |= CPU_DTRACE_DIVZERO;
5973                         } else {
5974                                 regs[rd] = (int64_t)regs[r1] %
5975                                     (int64_t)regs[r2];
5976                         }
5977                         break;
5978
5979                 case DIF_OP_UREM:
5980                         if (regs[r2] == 0) {
5981                                 regs[rd] = 0;
5982                                 *flags |= CPU_DTRACE_DIVZERO;
5983                         } else {
5984                                 regs[rd] = regs[r1] % regs[r2];
5985                         }
5986                         break;
5987
5988                 case DIF_OP_NOT:
5989                         regs[rd] = ~regs[r1];
5990                         break;
5991                 case DIF_OP_MOV:
5992                         regs[rd] = regs[r1];
5993                         break;
5994                 case DIF_OP_CMP:
5995                         cc_r = regs[r1] - regs[r2];
5996                         cc_n = cc_r < 0;
5997                         cc_z = cc_r == 0;
5998                         cc_v = 0;
5999                         cc_c = regs[r1] < regs[r2];
6000                         break;
6001                 case DIF_OP_TST:
6002                         cc_n = cc_v = cc_c = 0;
6003                         cc_z = regs[r1] == 0;
6004                         break;
6005                 case DIF_OP_BA:
6006                         pc = DIF_INSTR_LABEL(instr);
6007                         break;
6008                 case DIF_OP_BE:
6009                         if (cc_z)
6010                                 pc = DIF_INSTR_LABEL(instr);
6011                         break;
6012                 case DIF_OP_BNE:
6013                         if (cc_z == 0)
6014                                 pc = DIF_INSTR_LABEL(instr);
6015                         break;
6016                 case DIF_OP_BG:
6017                         if ((cc_z | (cc_n ^ cc_v)) == 0)
6018                                 pc = DIF_INSTR_LABEL(instr);
6019                         break;
6020                 case DIF_OP_BGU:
6021                         if ((cc_c | cc_z) == 0)
6022                                 pc = DIF_INSTR_LABEL(instr);
6023                         break;
6024                 case DIF_OP_BGE:
6025                         if ((cc_n ^ cc_v) == 0)
6026                                 pc = DIF_INSTR_LABEL(instr);
6027                         break;
6028                 case DIF_OP_BGEU:
6029                         if (cc_c == 0)
6030                                 pc = DIF_INSTR_LABEL(instr);
6031                         break;
6032                 case DIF_OP_BL:
6033                         if (cc_n ^ cc_v)
6034                                 pc = DIF_INSTR_LABEL(instr);
6035                         break;
6036                 case DIF_OP_BLU:
6037                         if (cc_c)
6038                                 pc = DIF_INSTR_LABEL(instr);
6039                         break;
6040                 case DIF_OP_BLE:
6041                         if (cc_z | (cc_n ^ cc_v))
6042                                 pc = DIF_INSTR_LABEL(instr);
6043                         break;
6044                 case DIF_OP_BLEU:
6045                         if (cc_c | cc_z)
6046                                 pc = DIF_INSTR_LABEL(instr);
6047                         break;
6048                 case DIF_OP_RLDSB:
6049                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6050                                 break;
6051                         /*FALLTHROUGH*/
6052                 case DIF_OP_LDSB:
6053                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6054                         break;
6055                 case DIF_OP_RLDSH:
6056                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6057                                 break;
6058                         /*FALLTHROUGH*/
6059                 case DIF_OP_LDSH:
6060                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6061                         break;
6062                 case DIF_OP_RLDSW:
6063                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6064                                 break;
6065                         /*FALLTHROUGH*/
6066                 case DIF_OP_LDSW:
6067                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6068                         break;
6069                 case DIF_OP_RLDUB:
6070                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6071                                 break;
6072                         /*FALLTHROUGH*/
6073                 case DIF_OP_LDUB:
6074                         regs[rd] = dtrace_load8(regs[r1]);
6075                         break;
6076                 case DIF_OP_RLDUH:
6077                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6078                                 break;
6079                         /*FALLTHROUGH*/
6080                 case DIF_OP_LDUH:
6081                         regs[rd] = dtrace_load16(regs[r1]);
6082                         break;
6083                 case DIF_OP_RLDUW:
6084                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6085                                 break;
6086                         /*FALLTHROUGH*/
6087                 case DIF_OP_LDUW:
6088                         regs[rd] = dtrace_load32(regs[r1]);
6089                         break;
6090                 case DIF_OP_RLDX:
6091                         if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6092                                 break;
6093                         /*FALLTHROUGH*/
6094                 case DIF_OP_LDX:
6095                         regs[rd] = dtrace_load64(regs[r1]);
6096                         break;
6097                 case DIF_OP_ULDSB:
6098                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6099                         regs[rd] = (int8_t)
6100                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6101                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6102                         break;
6103                 case DIF_OP_ULDSH:
6104                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6105                         regs[rd] = (int16_t)
6106                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6107                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6108                         break;
6109                 case DIF_OP_ULDSW:
6110                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6111                         regs[rd] = (int32_t)
6112                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6113                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6114                         break;
6115                 case DIF_OP_ULDUB:
6116                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6117                         regs[rd] =
6118                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6119                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6120                         break;
6121                 case DIF_OP_ULDUH:
6122                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6123                         regs[rd] =
6124                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6125                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6126                         break;
6127                 case DIF_OP_ULDUW:
6128                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6129                         regs[rd] =
6130                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6131                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6132                         break;
6133                 case DIF_OP_ULDX:
6134                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6135                         regs[rd] =
6136                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6137                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6138                         break;
6139                 case DIF_OP_RET:
6140                         rval = regs[rd];
6141                         pc = textlen;
6142                         break;
6143                 case DIF_OP_NOP:
6144                         break;
6145                 case DIF_OP_SETX:
6146                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6147                         break;
6148                 case DIF_OP_SETS:
6149                         regs[rd] = (uint64_t)(uintptr_t)
6150                             (strtab + DIF_INSTR_STRING(instr));
6151                         break;
6152                 case DIF_OP_SCMP: {
6153                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6154                         uintptr_t s1 = regs[r1];
6155                         uintptr_t s2 = regs[r2];
6156
6157                         if (s1 != 0 &&
6158                             !dtrace_strcanload(s1, sz, mstate, vstate))
6159                                 break;
6160                         if (s2 != 0 &&
6161                             !dtrace_strcanload(s2, sz, mstate, vstate))
6162                                 break;
6163
6164                         cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6165
6166                         cc_n = cc_r < 0;
6167                         cc_z = cc_r == 0;
6168                         cc_v = cc_c = 0;
6169                         break;
6170                 }
6171                 case DIF_OP_LDGA:
6172                         regs[rd] = dtrace_dif_variable(mstate, state,
6173                             r1, regs[r2]);
6174                         break;
6175                 case DIF_OP_LDGS:
6176                         id = DIF_INSTR_VAR(instr);
6177
6178                         if (id >= DIF_VAR_OTHER_UBASE) {
6179                                 uintptr_t a;
6180
6181                                 id -= DIF_VAR_OTHER_UBASE;
6182                                 svar = vstate->dtvs_globals[id];
6183                                 ASSERT(svar != NULL);
6184                                 v = &svar->dtsv_var;
6185
6186                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6187                                         regs[rd] = svar->dtsv_data;
6188                                         break;
6189                                 }
6190
6191                                 a = (uintptr_t)svar->dtsv_data;
6192
6193                                 if (*(uint8_t *)a == UINT8_MAX) {
6194                                         /*
6195                                          * If the 0th byte is set to UINT8_MAX
6196                                          * then this is to be treated as a
6197                                          * reference to a NULL variable.
6198                                          */
6199                                         regs[rd] = 0;
6200                                 } else {
6201                                         regs[rd] = a + sizeof (uint64_t);
6202                                 }
6203
6204                                 break;
6205                         }
6206
6207                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6208                         break;
6209
6210                 case DIF_OP_STGS:
6211                         id = DIF_INSTR_VAR(instr);
6212
6213                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6214                         id -= DIF_VAR_OTHER_UBASE;
6215
6216                         svar = vstate->dtvs_globals[id];
6217                         ASSERT(svar != NULL);
6218                         v = &svar->dtsv_var;
6219
6220                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6221                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6222
6223                                 ASSERT(a != 0);
6224                                 ASSERT(svar->dtsv_size != 0);
6225
6226                                 if (regs[rd] == 0) {
6227                                         *(uint8_t *)a = UINT8_MAX;
6228                                         break;
6229                                 } else {
6230                                         *(uint8_t *)a = 0;
6231                                         a += sizeof (uint64_t);
6232                                 }
6233                                 if (!dtrace_vcanload(
6234                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6235                                     mstate, vstate))
6236                                         break;
6237
6238                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6239                                     (void *)a, &v->dtdv_type);
6240                                 break;
6241                         }
6242
6243                         svar->dtsv_data = regs[rd];
6244                         break;
6245
6246                 case DIF_OP_LDTA:
6247                         /*
6248                          * There are no DTrace built-in thread-local arrays at
6249                          * present.  This opcode is saved for future work.
6250                          */
6251                         *flags |= CPU_DTRACE_ILLOP;
6252                         regs[rd] = 0;
6253                         break;
6254
6255                 case DIF_OP_LDLS:
6256                         id = DIF_INSTR_VAR(instr);
6257
6258                         if (id < DIF_VAR_OTHER_UBASE) {
6259                                 /*
6260                                  * For now, this has no meaning.
6261                                  */
6262                                 regs[rd] = 0;
6263                                 break;
6264                         }
6265
6266                         id -= DIF_VAR_OTHER_UBASE;
6267
6268                         ASSERT(id < vstate->dtvs_nlocals);
6269                         ASSERT(vstate->dtvs_locals != NULL);
6270
6271                         svar = vstate->dtvs_locals[id];
6272                         ASSERT(svar != NULL);
6273                         v = &svar->dtsv_var;
6274
6275                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6276                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6277                                 size_t sz = v->dtdv_type.dtdt_size;
6278
6279                                 sz += sizeof (uint64_t);
6280                                 ASSERT(svar->dtsv_size == NCPU * sz);
6281                                 a += curcpu * sz;
6282
6283                                 if (*(uint8_t *)a == UINT8_MAX) {
6284                                         /*
6285                                          * If the 0th byte is set to UINT8_MAX
6286                                          * then this is to be treated as a
6287                                          * reference to a NULL variable.
6288                                          */
6289                                         regs[rd] = 0;
6290                                 } else {
6291                                         regs[rd] = a + sizeof (uint64_t);
6292                                 }
6293
6294                                 break;
6295                         }
6296
6297                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6298                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6299                         regs[rd] = tmp[curcpu];
6300                         break;
6301
6302                 case DIF_OP_STLS:
6303                         id = DIF_INSTR_VAR(instr);
6304
6305                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6306                         id -= DIF_VAR_OTHER_UBASE;
6307                         ASSERT(id < vstate->dtvs_nlocals);
6308
6309                         ASSERT(vstate->dtvs_locals != NULL);
6310                         svar = vstate->dtvs_locals[id];
6311                         ASSERT(svar != NULL);
6312                         v = &svar->dtsv_var;
6313
6314                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6315                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6316                                 size_t sz = v->dtdv_type.dtdt_size;
6317
6318                                 sz += sizeof (uint64_t);
6319                                 ASSERT(svar->dtsv_size == NCPU * sz);
6320                                 a += curcpu * sz;
6321
6322                                 if (regs[rd] == 0) {
6323                                         *(uint8_t *)a = UINT8_MAX;
6324                                         break;
6325                                 } else {
6326                                         *(uint8_t *)a = 0;
6327                                         a += sizeof (uint64_t);
6328                                 }
6329
6330                                 if (!dtrace_vcanload(
6331                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6332                                     mstate, vstate))
6333                                         break;
6334
6335                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6336                                     (void *)a, &v->dtdv_type);
6337                                 break;
6338                         }
6339
6340                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6341                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6342                         tmp[curcpu] = regs[rd];
6343                         break;
6344
6345                 case DIF_OP_LDTS: {
6346                         dtrace_dynvar_t *dvar;
6347                         dtrace_key_t *key;
6348
6349                         id = DIF_INSTR_VAR(instr);
6350                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6351                         id -= DIF_VAR_OTHER_UBASE;
6352                         v = &vstate->dtvs_tlocals[id];
6353
6354                         key = &tupregs[DIF_DTR_NREGS];
6355                         key[0].dttk_value = (uint64_t)id;
6356                         key[0].dttk_size = 0;
6357                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6358                         key[1].dttk_size = 0;
6359
6360                         dvar = dtrace_dynvar(dstate, 2, key,
6361                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6362                             mstate, vstate);
6363
6364                         if (dvar == NULL) {
6365                                 regs[rd] = 0;
6366                                 break;
6367                         }
6368
6369                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6370                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6371                         } else {
6372                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6373                         }
6374
6375                         break;
6376                 }
6377
6378                 case DIF_OP_STTS: {
6379                         dtrace_dynvar_t *dvar;
6380                         dtrace_key_t *key;
6381
6382                         id = DIF_INSTR_VAR(instr);
6383                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6384                         id -= DIF_VAR_OTHER_UBASE;
6385
6386                         key = &tupregs[DIF_DTR_NREGS];
6387                         key[0].dttk_value = (uint64_t)id;
6388                         key[0].dttk_size = 0;
6389                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6390                         key[1].dttk_size = 0;
6391                         v = &vstate->dtvs_tlocals[id];
6392
6393                         dvar = dtrace_dynvar(dstate, 2, key,
6394                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6395                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6396                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6397                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6398
6399                         /*
6400                          * Given that we're storing to thread-local data,
6401                          * we need to flush our predicate cache.
6402                          */
6403                         curthread->t_predcache = 0;
6404
6405                         if (dvar == NULL)
6406                                 break;
6407
6408                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6409                                 if (!dtrace_vcanload(
6410                                     (void *)(uintptr_t)regs[rd],
6411                                     &v->dtdv_type, mstate, vstate))
6412                                         break;
6413
6414                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6415                                     dvar->dtdv_data, &v->dtdv_type);
6416                         } else {
6417                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6418                         }
6419
6420                         break;
6421                 }
6422
6423                 case DIF_OP_SRA:
6424                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
6425                         break;
6426
6427                 case DIF_OP_CALL:
6428                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6429                             regs, tupregs, ttop, mstate, state);
6430                         break;
6431
6432                 case DIF_OP_PUSHTR:
6433                         if (ttop == DIF_DTR_NREGS) {
6434                                 *flags |= CPU_DTRACE_TUPOFLOW;
6435                                 break;
6436                         }
6437
6438                         if (r1 == DIF_TYPE_STRING) {
6439                                 /*
6440                                  * If this is a string type and the size is 0,
6441                                  * we'll use the system-wide default string
6442                                  * size.  Note that we are _not_ looking at
6443                                  * the value of the DTRACEOPT_STRSIZE option;
6444                                  * had this been set, we would expect to have
6445                                  * a non-zero size value in the "pushtr".
6446                                  */
6447                                 tupregs[ttop].dttk_size =
6448                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
6449                                     regs[r2] ? regs[r2] :
6450                                     dtrace_strsize_default) + 1;
6451                         } else {
6452                                 tupregs[ttop].dttk_size = regs[r2];
6453                         }
6454
6455                         tupregs[ttop++].dttk_value = regs[rd];
6456                         break;
6457
6458                 case DIF_OP_PUSHTV:
6459                         if (ttop == DIF_DTR_NREGS) {
6460                                 *flags |= CPU_DTRACE_TUPOFLOW;
6461                                 break;
6462                         }
6463
6464                         tupregs[ttop].dttk_value = regs[rd];
6465                         tupregs[ttop++].dttk_size = 0;
6466                         break;
6467
6468                 case DIF_OP_POPTS:
6469                         if (ttop != 0)
6470                                 ttop--;
6471                         break;
6472
6473                 case DIF_OP_FLUSHTS:
6474                         ttop = 0;
6475                         break;
6476
6477                 case DIF_OP_LDGAA:
6478                 case DIF_OP_LDTAA: {
6479                         dtrace_dynvar_t *dvar;
6480                         dtrace_key_t *key = tupregs;
6481                         uint_t nkeys = ttop;
6482
6483                         id = DIF_INSTR_VAR(instr);
6484                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6485                         id -= DIF_VAR_OTHER_UBASE;
6486
6487                         key[nkeys].dttk_value = (uint64_t)id;
6488                         key[nkeys++].dttk_size = 0;
6489
6490                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6491                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6492                                 key[nkeys++].dttk_size = 0;
6493                                 v = &vstate->dtvs_tlocals[id];
6494                         } else {
6495                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6496                         }
6497
6498                         dvar = dtrace_dynvar(dstate, nkeys, key,
6499                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6500                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6501                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6502
6503                         if (dvar == NULL) {
6504                                 regs[rd] = 0;
6505                                 break;
6506                         }
6507
6508                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6509                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6510                         } else {
6511                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6512                         }
6513
6514                         break;
6515                 }
6516
6517                 case DIF_OP_STGAA:
6518                 case DIF_OP_STTAA: {
6519                         dtrace_dynvar_t *dvar;
6520                         dtrace_key_t *key = tupregs;
6521                         uint_t nkeys = ttop;
6522
6523                         id = DIF_INSTR_VAR(instr);
6524                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6525                         id -= DIF_VAR_OTHER_UBASE;
6526
6527                         key[nkeys].dttk_value = (uint64_t)id;
6528                         key[nkeys++].dttk_size = 0;
6529
6530                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6531                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6532                                 key[nkeys++].dttk_size = 0;
6533                                 v = &vstate->dtvs_tlocals[id];
6534                         } else {
6535                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6536                         }
6537
6538                         dvar = dtrace_dynvar(dstate, nkeys, key,
6539                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6540                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6541                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6542                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6543
6544                         if (dvar == NULL)
6545                                 break;
6546
6547                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6548                                 if (!dtrace_vcanload(
6549                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6550                                     mstate, vstate))
6551                                         break;
6552
6553                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6554                                     dvar->dtdv_data, &v->dtdv_type);
6555                         } else {
6556                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6557                         }
6558
6559                         break;
6560                 }
6561
6562                 case DIF_OP_ALLOCS: {
6563                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6564                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6565
6566                         /*
6567                          * Rounding up the user allocation size could have
6568                          * overflowed large, bogus allocations (like -1ULL) to
6569                          * 0.
6570                          */
6571                         if (size < regs[r1] ||
6572                             !DTRACE_INSCRATCH(mstate, size)) {
6573                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6574                                 regs[rd] = 0;
6575                                 break;
6576                         }
6577
6578                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6579                         mstate->dtms_scratch_ptr += size;
6580                         regs[rd] = ptr;
6581                         break;
6582                 }
6583
6584                 case DIF_OP_COPYS:
6585                         if (!dtrace_canstore(regs[rd], regs[r2],
6586                             mstate, vstate)) {
6587                                 *flags |= CPU_DTRACE_BADADDR;
6588                                 *illval = regs[rd];
6589                                 break;
6590                         }
6591
6592                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6593                                 break;
6594
6595                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
6596                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6597                         break;
6598
6599                 case DIF_OP_STB:
6600                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6601                                 *flags |= CPU_DTRACE_BADADDR;
6602                                 *illval = regs[rd];
6603                                 break;
6604                         }
6605                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6606                         break;
6607
6608                 case DIF_OP_STH:
6609                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6610                                 *flags |= CPU_DTRACE_BADADDR;
6611                                 *illval = regs[rd];
6612                                 break;
6613                         }
6614                         if (regs[rd] & 1) {
6615                                 *flags |= CPU_DTRACE_BADALIGN;
6616                                 *illval = regs[rd];
6617                                 break;
6618                         }
6619                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6620                         break;
6621
6622                 case DIF_OP_STW:
6623                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6624                                 *flags |= CPU_DTRACE_BADADDR;
6625                                 *illval = regs[rd];
6626                                 break;
6627                         }
6628                         if (regs[rd] & 3) {
6629                                 *flags |= CPU_DTRACE_BADALIGN;
6630                                 *illval = regs[rd];
6631                                 break;
6632                         }
6633                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6634                         break;
6635
6636                 case DIF_OP_STX:
6637                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6638                                 *flags |= CPU_DTRACE_BADADDR;
6639                                 *illval = regs[rd];
6640                                 break;
6641                         }
6642                         if (regs[rd] & 7) {
6643                                 *flags |= CPU_DTRACE_BADALIGN;
6644                                 *illval = regs[rd];
6645                                 break;
6646                         }
6647                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6648                         break;
6649                 }
6650         }
6651
6652         if (!(*flags & CPU_DTRACE_FAULT))
6653                 return (rval);
6654
6655         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6656         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6657
6658         return (0);
6659 }
6660
6661 static void
6662 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6663 {
6664         dtrace_probe_t *probe = ecb->dte_probe;
6665         dtrace_provider_t *prov = probe->dtpr_provider;
6666         char c[DTRACE_FULLNAMELEN + 80], *str;
6667         char *msg = "dtrace: breakpoint action at probe ";
6668         char *ecbmsg = " (ecb ";
6669         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6670         uintptr_t val = (uintptr_t)ecb;
6671         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6672
6673         if (dtrace_destructive_disallow)
6674                 return;
6675
6676         /*
6677          * It's impossible to be taking action on the NULL probe.
6678          */
6679         ASSERT(probe != NULL);
6680
6681         /*
6682          * This is a poor man's (destitute man's?) sprintf():  we want to
6683          * print the provider name, module name, function name and name of
6684          * the probe, along with the hex address of the ECB with the breakpoint
6685          * action -- all of which we must place in the character buffer by
6686          * hand.
6687          */
6688         while (*msg != '\0')
6689                 c[i++] = *msg++;
6690
6691         for (str = prov->dtpv_name; *str != '\0'; str++)
6692                 c[i++] = *str;
6693         c[i++] = ':';
6694
6695         for (str = probe->dtpr_mod; *str != '\0'; str++)
6696                 c[i++] = *str;
6697         c[i++] = ':';
6698
6699         for (str = probe->dtpr_func; *str != '\0'; str++)
6700                 c[i++] = *str;
6701         c[i++] = ':';
6702
6703         for (str = probe->dtpr_name; *str != '\0'; str++)
6704                 c[i++] = *str;
6705
6706         while (*ecbmsg != '\0')
6707                 c[i++] = *ecbmsg++;
6708
6709         while (shift >= 0) {
6710                 mask = (uintptr_t)0xf << shift;
6711
6712                 if (val >= ((uintptr_t)1 << shift))
6713                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6714                 shift -= 4;
6715         }
6716
6717         c[i++] = ')';
6718         c[i] = '\0';
6719
6720 #if defined(sun)
6721         debug_enter(c);
6722 #else
6723         kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6724 #endif
6725 }
6726
6727 static void
6728 dtrace_action_panic(dtrace_ecb_t *ecb)
6729 {
6730         dtrace_probe_t *probe = ecb->dte_probe;
6731
6732         /*
6733          * It's impossible to be taking action on the NULL probe.
6734          */
6735         ASSERT(probe != NULL);
6736
6737         if (dtrace_destructive_disallow)
6738                 return;
6739
6740         if (dtrace_panicked != NULL)
6741                 return;
6742
6743         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6744                 return;
6745
6746         /*
6747          * We won the right to panic.  (We want to be sure that only one
6748          * thread calls panic() from dtrace_probe(), and that panic() is
6749          * called exactly once.)
6750          */
6751         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6752             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6753             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6754 }
6755
6756 static void
6757 dtrace_action_raise(uint64_t sig)
6758 {
6759         if (dtrace_destructive_disallow)
6760                 return;
6761
6762         if (sig >= NSIG) {
6763                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6764                 return;
6765         }
6766
6767 #if defined(sun)
6768         /*
6769          * raise() has a queue depth of 1 -- we ignore all subsequent
6770          * invocations of the raise() action.
6771          */
6772         if (curthread->t_dtrace_sig == 0)
6773                 curthread->t_dtrace_sig = (uint8_t)sig;
6774
6775         curthread->t_sig_check = 1;
6776         aston(curthread);
6777 #else
6778         struct proc *p = curproc;
6779         PROC_LOCK(p);
6780         kern_psignal(p, sig);
6781         PROC_UNLOCK(p);
6782 #endif
6783 }
6784
6785 static void
6786 dtrace_action_stop(void)
6787 {
6788         if (dtrace_destructive_disallow)
6789                 return;
6790
6791 #if defined(sun)
6792         if (!curthread->t_dtrace_stop) {
6793                 curthread->t_dtrace_stop = 1;
6794                 curthread->t_sig_check = 1;
6795                 aston(curthread);
6796         }
6797 #else
6798         struct proc *p = curproc;
6799         PROC_LOCK(p);
6800         kern_psignal(p, SIGSTOP);
6801         PROC_UNLOCK(p);
6802 #endif
6803 }
6804
6805 static void
6806 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6807 {
6808         hrtime_t now;
6809         volatile uint16_t *flags;
6810 #if defined(sun)
6811         cpu_t *cpu = CPU;
6812 #else
6813         cpu_t *cpu = &solaris_cpu[curcpu];
6814 #endif
6815
6816         if (dtrace_destructive_disallow)
6817                 return;
6818
6819         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6820
6821         now = dtrace_gethrtime();
6822
6823         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6824                 /*
6825                  * We need to advance the mark to the current time.
6826                  */
6827                 cpu->cpu_dtrace_chillmark = now;
6828                 cpu->cpu_dtrace_chilled = 0;
6829         }
6830
6831         /*
6832          * Now check to see if the requested chill time would take us over
6833          * the maximum amount of time allowed in the chill interval.  (Or
6834          * worse, if the calculation itself induces overflow.)
6835          */
6836         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6837             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6838                 *flags |= CPU_DTRACE_ILLOP;
6839                 return;
6840         }
6841
6842         while (dtrace_gethrtime() - now < val)
6843                 continue;
6844
6845         /*
6846          * Normally, we assure that the value of the variable "timestamp" does
6847          * not change within an ECB.  The presence of chill() represents an
6848          * exception to this rule, however.
6849          */
6850         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6851         cpu->cpu_dtrace_chilled += val;
6852 }
6853
6854 static void
6855 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6856     uint64_t *buf, uint64_t arg)
6857 {
6858         int nframes = DTRACE_USTACK_NFRAMES(arg);
6859         int strsize = DTRACE_USTACK_STRSIZE(arg);
6860         uint64_t *pcs = &buf[1], *fps;
6861         char *str = (char *)&pcs[nframes];
6862         int size, offs = 0, i, j;
6863         uintptr_t old = mstate->dtms_scratch_ptr, saved;
6864         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6865         char *sym;
6866
6867         /*
6868          * Should be taking a faster path if string space has not been
6869          * allocated.
6870          */
6871         ASSERT(strsize != 0);
6872
6873         /*
6874          * We will first allocate some temporary space for the frame pointers.
6875          */
6876         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6877         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6878             (nframes * sizeof (uint64_t));
6879
6880         if (!DTRACE_INSCRATCH(mstate, size)) {
6881                 /*
6882                  * Not enough room for our frame pointers -- need to indicate
6883                  * that we ran out of scratch space.
6884                  */
6885                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6886                 return;
6887         }
6888
6889         mstate->dtms_scratch_ptr += size;
6890         saved = mstate->dtms_scratch_ptr;
6891
6892         /*
6893          * Now get a stack with both program counters and frame pointers.
6894          */
6895         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6896         dtrace_getufpstack(buf, fps, nframes + 1);
6897         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6898
6899         /*
6900          * If that faulted, we're cooked.
6901          */
6902         if (*flags & CPU_DTRACE_FAULT)
6903                 goto out;
6904
6905         /*
6906          * Now we want to walk up the stack, calling the USTACK helper.  For
6907          * each iteration, we restore the scratch pointer.
6908          */
6909         for (i = 0; i < nframes; i++) {
6910                 mstate->dtms_scratch_ptr = saved;
6911
6912                 if (offs >= strsize)
6913                         break;
6914
6915                 sym = (char *)(uintptr_t)dtrace_helper(
6916                     DTRACE_HELPER_ACTION_USTACK,
6917                     mstate, state, pcs[i], fps[i]);
6918
6919                 /*
6920                  * If we faulted while running the helper, we're going to
6921                  * clear the fault and null out the corresponding string.
6922                  */
6923                 if (*flags & CPU_DTRACE_FAULT) {
6924                         *flags &= ~CPU_DTRACE_FAULT;
6925                         str[offs++] = '\0';
6926                         continue;
6927                 }
6928
6929                 if (sym == NULL) {
6930                         str[offs++] = '\0';
6931                         continue;
6932                 }
6933
6934                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6935
6936                 /*
6937                  * Now copy in the string that the helper returned to us.
6938                  */
6939                 for (j = 0; offs + j < strsize; j++) {
6940                         if ((str[offs + j] = sym[j]) == '\0')
6941                                 break;
6942                 }
6943
6944                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6945
6946                 offs += j + 1;
6947         }
6948
6949         if (offs >= strsize) {
6950                 /*
6951                  * If we didn't have room for all of the strings, we don't
6952                  * abort processing -- this needn't be a fatal error -- but we
6953                  * still want to increment a counter (dts_stkstroverflows) to
6954                  * allow this condition to be warned about.  (If this is from
6955                  * a jstack() action, it is easily tuned via jstackstrsize.)
6956                  */
6957                 dtrace_error(&state->dts_stkstroverflows);
6958         }
6959
6960         while (offs < strsize)
6961                 str[offs++] = '\0';
6962
6963 out:
6964         mstate->dtms_scratch_ptr = old;
6965 }
6966
6967 static void
6968 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6969     size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6970 {
6971         volatile uint16_t *flags;
6972         uint64_t val = *valp;
6973         size_t valoffs = *valoffsp;
6974
6975         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6976         ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6977
6978         /*
6979          * If this is a string, we're going to only load until we find the zero
6980          * byte -- after which we'll store zero bytes.
6981          */
6982         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6983                 char c = '\0' + 1;
6984                 size_t s;
6985
6986                 for (s = 0; s < size; s++) {
6987                         if (c != '\0' && dtkind == DIF_TF_BYREF) {
6988                                 c = dtrace_load8(val++);
6989                         } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6990                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6991                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
6992                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6993                                 if (*flags & CPU_DTRACE_FAULT)
6994                                         break;
6995                         }
6996
6997                         DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6998
6999                         if (c == '\0' && intuple)
7000                                 break;
7001                 }
7002         } else {
7003                 uint8_t c;
7004                 while (valoffs < end) {
7005                         if (dtkind == DIF_TF_BYREF) {
7006                                 c = dtrace_load8(val++);
7007                         } else if (dtkind == DIF_TF_BYUREF) {
7008                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7009                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
7010                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7011                                 if (*flags & CPU_DTRACE_FAULT)
7012                                         break;
7013                         }
7014
7015                         DTRACE_STORE(uint8_t, tomax,
7016                             valoffs++, c);
7017                 }
7018         }
7019
7020         *valp = val;
7021         *valoffsp = valoffs;
7022 }
7023
7024 /*
7025  * If you're looking for the epicenter of DTrace, you just found it.  This
7026  * is the function called by the provider to fire a probe -- from which all
7027  * subsequent probe-context DTrace activity emanates.
7028  */
7029 void
7030 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7031     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7032 {
7033         processorid_t cpuid;
7034         dtrace_icookie_t cookie;
7035         dtrace_probe_t *probe;
7036         dtrace_mstate_t mstate;
7037         dtrace_ecb_t *ecb;
7038         dtrace_action_t *act;
7039         intptr_t offs;
7040         size_t size;
7041         int vtime, onintr;
7042         volatile uint16_t *flags;
7043         hrtime_t now;
7044
7045         if (panicstr != NULL)
7046                 return;
7047
7048 #if defined(sun)
7049         /*
7050          * Kick out immediately if this CPU is still being born (in which case
7051          * curthread will be set to -1) or the current thread can't allow
7052          * probes in its current context.
7053          */
7054         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7055                 return;
7056 #endif
7057
7058         cookie = dtrace_interrupt_disable();
7059         probe = dtrace_probes[id - 1];
7060         cpuid = curcpu;
7061         onintr = CPU_ON_INTR(CPU);
7062
7063         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7064             probe->dtpr_predcache == curthread->t_predcache) {
7065                 /*
7066                  * We have hit in the predicate cache; we know that
7067                  * this predicate would evaluate to be false.
7068                  */
7069                 dtrace_interrupt_enable(cookie);
7070                 return;
7071         }
7072
7073 #if defined(sun)
7074         if (panic_quiesce) {
7075 #else
7076         if (panicstr != NULL) {
7077 #endif
7078                 /*
7079                  * We don't trace anything if we're panicking.
7080                  */
7081                 dtrace_interrupt_enable(cookie);
7082                 return;
7083         }
7084
7085         now = dtrace_gethrtime();
7086         vtime = dtrace_vtime_references != 0;
7087
7088         if (vtime && curthread->t_dtrace_start)
7089                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7090
7091         mstate.dtms_difo = NULL;
7092         mstate.dtms_probe = probe;
7093         mstate.dtms_strtok = 0;
7094         mstate.dtms_arg[0] = arg0;
7095         mstate.dtms_arg[1] = arg1;
7096         mstate.dtms_arg[2] = arg2;
7097         mstate.dtms_arg[3] = arg3;
7098         mstate.dtms_arg[4] = arg4;
7099
7100         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7101
7102         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7103                 dtrace_predicate_t *pred = ecb->dte_predicate;
7104                 dtrace_state_t *state = ecb->dte_state;
7105                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7106                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7107                 dtrace_vstate_t *vstate = &state->dts_vstate;
7108                 dtrace_provider_t *prov = probe->dtpr_provider;
7109                 uint64_t tracememsize = 0;
7110                 int committed = 0;
7111                 caddr_t tomax;
7112
7113                 /*
7114                  * A little subtlety with the following (seemingly innocuous)
7115                  * declaration of the automatic 'val':  by looking at the
7116                  * code, you might think that it could be declared in the
7117                  * action processing loop, below.  (That is, it's only used in
7118                  * the action processing loop.)  However, it must be declared
7119                  * out of that scope because in the case of DIF expression
7120                  * arguments to aggregating actions, one iteration of the
7121                  * action loop will use the last iteration's value.
7122                  */
7123                 uint64_t val = 0;
7124
7125                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7126                 mstate.dtms_getf = NULL;
7127
7128                 *flags &= ~CPU_DTRACE_ERROR;
7129
7130                 if (prov == dtrace_provider) {
7131                         /*
7132                          * If dtrace itself is the provider of this probe,
7133                          * we're only going to continue processing the ECB if
7134                          * arg0 (the dtrace_state_t) is equal to the ECB's
7135                          * creating state.  (This prevents disjoint consumers
7136                          * from seeing one another's metaprobes.)
7137                          */
7138                         if (arg0 != (uint64_t)(uintptr_t)state)
7139                                 continue;
7140                 }
7141
7142                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7143                         /*
7144                          * We're not currently active.  If our provider isn't
7145                          * the dtrace pseudo provider, we're not interested.
7146                          */
7147                         if (prov != dtrace_provider)
7148                                 continue;
7149
7150                         /*
7151                          * Now we must further check if we are in the BEGIN
7152                          * probe.  If we are, we will only continue processing
7153                          * if we're still in WARMUP -- if one BEGIN enabling
7154                          * has invoked the exit() action, we don't want to
7155                          * evaluate subsequent BEGIN enablings.
7156                          */
7157                         if (probe->dtpr_id == dtrace_probeid_begin &&
7158                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7159                                 ASSERT(state->dts_activity ==
7160                                     DTRACE_ACTIVITY_DRAINING);
7161                                 continue;
7162                         }
7163                 }
7164
7165                 if (ecb->dte_cond) {
7166                         /*
7167                          * If the dte_cond bits indicate that this
7168                          * consumer is only allowed to see user-mode firings
7169                          * of this probe, call the provider's dtps_usermode()
7170                          * entry point to check that the probe was fired
7171                          * while in a user context. Skip this ECB if that's
7172                          * not the case.
7173                          */
7174                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7175                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7176                             probe->dtpr_id, probe->dtpr_arg) == 0)
7177                                 continue;
7178
7179 #if defined(sun)
7180                         /*
7181                          * This is more subtle than it looks. We have to be
7182                          * absolutely certain that CRED() isn't going to
7183                          * change out from under us so it's only legit to
7184                          * examine that structure if we're in constrained
7185                          * situations. Currently, the only time we'll do this
7186                          * check is if a non-super-user has enabled the
7187                          * profile or syscall providers -- providers that
7188                          * allow visibility of all processes. For the
7189                          * profile case, the check above will ensure that
7190                          * we're examining a user context.
7191                          */
7192                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
7193                                 cred_t *cr;
7194                                 cred_t *s_cr =
7195                                     ecb->dte_state->dts_cred.dcr_cred;
7196                                 proc_t *proc;
7197
7198                                 ASSERT(s_cr != NULL);
7199
7200                                 if ((cr = CRED()) == NULL ||
7201                                     s_cr->cr_uid != cr->cr_uid ||
7202                                     s_cr->cr_uid != cr->cr_ruid ||
7203                                     s_cr->cr_uid != cr->cr_suid ||
7204                                     s_cr->cr_gid != cr->cr_gid ||
7205                                     s_cr->cr_gid != cr->cr_rgid ||
7206                                     s_cr->cr_gid != cr->cr_sgid ||
7207                                     (proc = ttoproc(curthread)) == NULL ||
7208                                     (proc->p_flag & SNOCD))
7209                                         continue;
7210                         }
7211
7212                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7213                                 cred_t *cr;
7214                                 cred_t *s_cr =
7215                                     ecb->dte_state->dts_cred.dcr_cred;
7216
7217                                 ASSERT(s_cr != NULL);
7218
7219                                 if ((cr = CRED()) == NULL ||
7220                                     s_cr->cr_zone->zone_id !=
7221                                     cr->cr_zone->zone_id)
7222                                         continue;
7223                         }
7224 #endif
7225                 }
7226
7227                 if (now - state->dts_alive > dtrace_deadman_timeout) {
7228                         /*
7229                          * We seem to be dead.  Unless we (a) have kernel
7230                          * destructive permissions (b) have explicitly enabled
7231                          * destructive actions and (c) destructive actions have
7232                          * not been disabled, we're going to transition into
7233                          * the KILLED state, from which no further processing
7234                          * on this state will be performed.
7235                          */
7236                         if (!dtrace_priv_kernel_destructive(state) ||
7237                             !state->dts_cred.dcr_destructive ||
7238                             dtrace_destructive_disallow) {
7239                                 void *activity = &state->dts_activity;
7240                                 dtrace_activity_t current;
7241
7242                                 do {
7243                                         current = state->dts_activity;
7244                                 } while (dtrace_cas32(activity, current,
7245                                     DTRACE_ACTIVITY_KILLED) != current);
7246
7247                                 continue;
7248                         }
7249                 }
7250
7251                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7252                     ecb->dte_alignment, state, &mstate)) < 0)
7253                         continue;
7254
7255                 tomax = buf->dtb_tomax;
7256                 ASSERT(tomax != NULL);
7257
7258                 if (ecb->dte_size != 0) {
7259                         dtrace_rechdr_t dtrh;
7260                         if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7261                                 mstate.dtms_timestamp = dtrace_gethrtime();
7262                                 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7263                         }
7264                         ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7265                         dtrh.dtrh_epid = ecb->dte_epid;
7266                         DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7267                             mstate.dtms_timestamp);
7268                         *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7269                 }
7270
7271                 mstate.dtms_epid = ecb->dte_epid;
7272                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7273
7274                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7275                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7276                 else
7277                         mstate.dtms_access = 0;
7278
7279                 if (pred != NULL) {
7280                         dtrace_difo_t *dp = pred->dtp_difo;
7281                         int rval;
7282
7283                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7284
7285                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7286                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
7287
7288                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7289                                         /*
7290                                          * Update the predicate cache...
7291                                          */
7292                                         ASSERT(cid == pred->dtp_cacheid);
7293                                         curthread->t_predcache = cid;
7294                                 }
7295
7296                                 continue;
7297                         }
7298                 }
7299
7300                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7301                     act != NULL; act = act->dta_next) {
7302                         size_t valoffs;
7303                         dtrace_difo_t *dp;
7304                         dtrace_recdesc_t *rec = &act->dta_rec;
7305
7306                         size = rec->dtrd_size;
7307                         valoffs = offs + rec->dtrd_offset;
7308
7309                         if (DTRACEACT_ISAGG(act->dta_kind)) {
7310                                 uint64_t v = 0xbad;
7311                                 dtrace_aggregation_t *agg;
7312
7313                                 agg = (dtrace_aggregation_t *)act;
7314
7315                                 if ((dp = act->dta_difo) != NULL)
7316                                         v = dtrace_dif_emulate(dp,
7317                                             &mstate, vstate, state);
7318
7319                                 if (*flags & CPU_DTRACE_ERROR)
7320                                         continue;
7321
7322                                 /*
7323                                  * Note that we always pass the expression
7324                                  * value from the previous iteration of the
7325                                  * action loop.  This value will only be used
7326                                  * if there is an expression argument to the
7327                                  * aggregating action, denoted by the
7328                                  * dtag_hasarg field.
7329                                  */
7330                                 dtrace_aggregate(agg, buf,
7331                                     offs, aggbuf, v, val);
7332                                 continue;
7333                         }
7334
7335                         switch (act->dta_kind) {
7336                         case DTRACEACT_STOP:
7337                                 if (dtrace_priv_proc_destructive(state))
7338                                         dtrace_action_stop();
7339                                 continue;
7340
7341                         case DTRACEACT_BREAKPOINT:
7342                                 if (dtrace_priv_kernel_destructive(state))
7343                                         dtrace_action_breakpoint(ecb);
7344                                 continue;
7345
7346                         case DTRACEACT_PANIC:
7347                                 if (dtrace_priv_kernel_destructive(state))
7348                                         dtrace_action_panic(ecb);
7349                                 continue;
7350
7351                         case DTRACEACT_STACK:
7352                                 if (!dtrace_priv_kernel(state))
7353                                         continue;
7354
7355                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7356                                     size / sizeof (pc_t), probe->dtpr_aframes,
7357                                     DTRACE_ANCHORED(probe) ? NULL :
7358                                     (uint32_t *)arg0);
7359                                 continue;
7360
7361                         case DTRACEACT_JSTACK:
7362                         case DTRACEACT_USTACK:
7363                                 if (!dtrace_priv_proc(state))
7364                                         continue;
7365
7366                                 /*
7367                                  * See comment in DIF_VAR_PID.
7368                                  */
7369                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7370                                     CPU_ON_INTR(CPU)) {
7371                                         int depth = DTRACE_USTACK_NFRAMES(
7372                                             rec->dtrd_arg) + 1;
7373
7374                                         dtrace_bzero((void *)(tomax + valoffs),
7375                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7376                                             + depth * sizeof (uint64_t));
7377
7378                                         continue;
7379                                 }
7380
7381                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7382                                     curproc->p_dtrace_helpers != NULL) {
7383                                         /*
7384                                          * This is the slow path -- we have
7385                                          * allocated string space, and we're
7386                                          * getting the stack of a process that
7387                                          * has helpers.  Call into a separate
7388                                          * routine to perform this processing.
7389                                          */
7390                                         dtrace_action_ustack(&mstate, state,
7391                                             (uint64_t *)(tomax + valoffs),
7392                                             rec->dtrd_arg);
7393                                         continue;
7394                                 }
7395
7396                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7397                                 dtrace_getupcstack((uint64_t *)
7398                                     (tomax + valoffs),
7399                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7400                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7401                                 continue;
7402
7403                         default:
7404                                 break;
7405                         }
7406
7407                         dp = act->dta_difo;
7408                         ASSERT(dp != NULL);
7409
7410                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7411
7412                         if (*flags & CPU_DTRACE_ERROR)
7413                                 continue;
7414
7415                         switch (act->dta_kind) {
7416                         case DTRACEACT_SPECULATE: {
7417                                 dtrace_rechdr_t *dtrh;
7418
7419                                 ASSERT(buf == &state->dts_buffer[cpuid]);
7420                                 buf = dtrace_speculation_buffer(state,
7421                                     cpuid, val);
7422
7423                                 if (buf == NULL) {
7424                                         *flags |= CPU_DTRACE_DROP;
7425                                         continue;
7426                                 }
7427
7428                                 offs = dtrace_buffer_reserve(buf,
7429                                     ecb->dte_needed, ecb->dte_alignment,
7430                                     state, NULL);
7431
7432                                 if (offs < 0) {
7433                                         *flags |= CPU_DTRACE_DROP;
7434                                         continue;
7435                                 }
7436
7437                                 tomax = buf->dtb_tomax;
7438                                 ASSERT(tomax != NULL);
7439
7440                                 if (ecb->dte_size == 0)
7441                                         continue;
7442
7443                                 ASSERT3U(ecb->dte_size, >=,
7444                                     sizeof (dtrace_rechdr_t));
7445                                 dtrh = ((void *)(tomax + offs));
7446                                 dtrh->dtrh_epid = ecb->dte_epid;
7447                                 /*
7448                                  * When the speculation is committed, all of
7449                                  * the records in the speculative buffer will
7450                                  * have their timestamps set to the commit
7451                                  * time.  Until then, it is set to a sentinel
7452                                  * value, for debuggability.
7453                                  */
7454                                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7455                                 continue;
7456                         }
7457
7458                         case DTRACEACT_PRINTM: {
7459                                 /* The DIF returns a 'memref'. */
7460                                 uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7461
7462                                 /* Get the size from the memref. */
7463                                 size = memref[1];
7464
7465                                 /*
7466                                  * Check if the size exceeds the allocated
7467                                  * buffer size.
7468                                  */
7469                                 if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7470                                         /* Flag a drop! */
7471                                         *flags |= CPU_DTRACE_DROP;
7472                                         continue;
7473                                 }
7474
7475                                 /* Store the size in the buffer first. */
7476                                 DTRACE_STORE(uintptr_t, tomax,
7477                                     valoffs, size);
7478
7479                                 /*
7480                                  * Offset the buffer address to the start
7481                                  * of the data.
7482                                  */
7483                                 valoffs += sizeof(uintptr_t);
7484
7485                                 /*
7486                                  * Reset to the memory address rather than
7487                                  * the memref array, then let the BYREF
7488                                  * code below do the work to store the
7489                                  * memory data in the buffer.
7490                                  */
7491                                 val = memref[0];
7492                                 break;
7493                         }
7494
7495                         case DTRACEACT_PRINTT: {
7496                                 /* The DIF returns a 'typeref'. */
7497                                 uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7498                                 char c = '\0' + 1;
7499                                 size_t s;
7500
7501                                 /*
7502                                  * Get the type string length and round it
7503                                  * up so that the data that follows is
7504                                  * aligned for easy access.
7505                                  */
7506                                 size_t typs = strlen((char *) typeref[2]) + 1;
7507                                 typs = roundup(typs,  sizeof(uintptr_t));
7508
7509                                 /*
7510                                  * Get the size from the typeref using the
7511                                  * number of elements and the type size.
7512                                  */
7513                                 size = typeref[1] * typeref[3];
7514
7515                                 /*
7516                                  * Check if the size exceeds the allocated
7517                                  * buffer size.
7518                                  */
7519                                 if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7520                                         /* Flag a drop! */
7521                                         *flags |= CPU_DTRACE_DROP;
7522
7523                                 }
7524
7525                                 /* Store the size in the buffer first. */
7526                                 DTRACE_STORE(uintptr_t, tomax,
7527                                     valoffs, size);
7528                                 valoffs += sizeof(uintptr_t);
7529
7530                                 /* Store the type size in the buffer. */
7531                                 DTRACE_STORE(uintptr_t, tomax,
7532                                     valoffs, typeref[3]);
7533                                 valoffs += sizeof(uintptr_t);
7534
7535                                 val = typeref[2];
7536
7537                                 for (s = 0; s < typs; s++) {
7538                                         if (c != '\0')
7539                                                 c = dtrace_load8(val++);
7540
7541                                         DTRACE_STORE(uint8_t, tomax,
7542                                             valoffs++, c);
7543                                 }
7544
7545                                 /*
7546                                  * Reset to the memory address rather than
7547                                  * the typeref array, then let the BYREF
7548                                  * code below do the work to store the
7549                                  * memory data in the buffer.
7550                                  */
7551                                 val = typeref[0];
7552                                 break;
7553                         }
7554
7555                         case DTRACEACT_CHILL:
7556                                 if (dtrace_priv_kernel_destructive(state))
7557                                         dtrace_action_chill(&mstate, val);
7558                                 continue;
7559
7560                         case DTRACEACT_RAISE:
7561                                 if (dtrace_priv_proc_destructive(state))
7562                                         dtrace_action_raise(val);
7563                                 continue;
7564
7565                         case DTRACEACT_COMMIT:
7566                                 ASSERT(!committed);
7567
7568                                 /*
7569                                  * We need to commit our buffer state.
7570                                  */
7571                                 if (ecb->dte_size)
7572                                         buf->dtb_offset = offs + ecb->dte_size;
7573                                 buf = &state->dts_buffer[cpuid];
7574                                 dtrace_speculation_commit(state, cpuid, val);
7575                                 committed = 1;
7576                                 continue;
7577
7578                         case DTRACEACT_DISCARD:
7579                                 dtrace_speculation_discard(state, cpuid, val);
7580                                 continue;
7581
7582                         case DTRACEACT_DIFEXPR:
7583                         case DTRACEACT_LIBACT:
7584                         case DTRACEACT_PRINTF:
7585                         case DTRACEACT_PRINTA:
7586                         case DTRACEACT_SYSTEM:
7587                         case DTRACEACT_FREOPEN:
7588                         case DTRACEACT_TRACEMEM:
7589                                 break;
7590
7591                         case DTRACEACT_TRACEMEM_DYNSIZE:
7592                                 tracememsize = val;
7593                                 break;
7594
7595                         case DTRACEACT_SYM:
7596                         case DTRACEACT_MOD:
7597                                 if (!dtrace_priv_kernel(state))
7598                                         continue;
7599                                 break;
7600
7601                         case DTRACEACT_USYM:
7602                         case DTRACEACT_UMOD:
7603                         case DTRACEACT_UADDR: {
7604 #if defined(sun)
7605                                 struct pid *pid = curthread->t_procp->p_pidp;
7606 #endif
7607
7608                                 if (!dtrace_priv_proc(state))
7609                                         continue;
7610
7611                                 DTRACE_STORE(uint64_t, tomax,
7612 #if defined(sun)
7613                                     valoffs, (uint64_t)pid->pid_id);
7614 #else
7615                                     valoffs, (uint64_t) curproc->p_pid);
7616 #endif
7617                                 DTRACE_STORE(uint64_t, tomax,
7618                                     valoffs + sizeof (uint64_t), val);
7619
7620                                 continue;
7621                         }
7622
7623                         case DTRACEACT_EXIT: {
7624                                 /*
7625                                  * For the exit action, we are going to attempt
7626                                  * to atomically set our activity to be
7627                                  * draining.  If this fails (either because
7628                                  * another CPU has beat us to the exit action,
7629                                  * or because our current activity is something
7630                                  * other than ACTIVE or WARMUP), we will
7631                                  * continue.  This assures that the exit action
7632                                  * can be successfully recorded at most once
7633                                  * when we're in the ACTIVE state.  If we're
7634                                  * encountering the exit() action while in
7635                                  * COOLDOWN, however, we want to honor the new
7636                                  * status code.  (We know that we're the only
7637                                  * thread in COOLDOWN, so there is no race.)
7638                                  */
7639                                 void *activity = &state->dts_activity;
7640                                 dtrace_activity_t current = state->dts_activity;
7641
7642                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
7643                                         break;
7644
7645                                 if (current != DTRACE_ACTIVITY_WARMUP)
7646                                         current = DTRACE_ACTIVITY_ACTIVE;
7647
7648                                 if (dtrace_cas32(activity, current,
7649                                     DTRACE_ACTIVITY_DRAINING) != current) {
7650                                         *flags |= CPU_DTRACE_DROP;
7651                                         continue;
7652                                 }
7653
7654                                 break;
7655                         }
7656
7657                         default:
7658                                 ASSERT(0);
7659                         }
7660
7661                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7662                             dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7663                                 uintptr_t end = valoffs + size;
7664
7665                                 if (tracememsize != 0 &&
7666                                     valoffs + tracememsize < end) {
7667                                         end = valoffs + tracememsize;
7668                                         tracememsize = 0;
7669                                 }
7670
7671                                 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7672                                     !dtrace_vcanload((void *)(uintptr_t)val,
7673                                     &dp->dtdo_rtype, &mstate, vstate))
7674                                         continue;
7675
7676                                 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7677                                     &val, end, act->dta_intuple,
7678                                     dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7679                                     DIF_TF_BYREF: DIF_TF_BYUREF);
7680                                 continue;
7681                         }
7682
7683                         switch (size) {
7684                         case 0:
7685                                 break;
7686
7687                         case sizeof (uint8_t):
7688                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7689                                 break;
7690                         case sizeof (uint16_t):
7691                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7692                                 break;
7693                         case sizeof (uint32_t):
7694                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7695                                 break;
7696                         case sizeof (uint64_t):
7697                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7698                                 break;
7699                         default:
7700                                 /*
7701                                  * Any other size should have been returned by
7702                                  * reference, not by value.
7703                                  */
7704                                 ASSERT(0);
7705                                 break;
7706                         }
7707                 }
7708
7709                 if (*flags & CPU_DTRACE_DROP)
7710                         continue;
7711
7712                 if (*flags & CPU_DTRACE_FAULT) {
7713                         int ndx;
7714                         dtrace_action_t *err;
7715
7716                         buf->dtb_errors++;
7717
7718                         if (probe->dtpr_id == dtrace_probeid_error) {
7719                                 /*
7720                                  * There's nothing we can do -- we had an
7721                                  * error on the error probe.  We bump an
7722                                  * error counter to at least indicate that
7723                                  * this condition happened.
7724                                  */
7725                                 dtrace_error(&state->dts_dblerrors);
7726                                 continue;
7727                         }
7728
7729                         if (vtime) {
7730                                 /*
7731                                  * Before recursing on dtrace_probe(), we
7732                                  * need to explicitly clear out our start
7733                                  * time to prevent it from being accumulated
7734                                  * into t_dtrace_vtime.
7735                                  */
7736                                 curthread->t_dtrace_start = 0;
7737                         }
7738
7739                         /*
7740                          * Iterate over the actions to figure out which action
7741                          * we were processing when we experienced the error.
7742                          * Note that act points _past_ the faulting action; if
7743                          * act is ecb->dte_action, the fault was in the
7744                          * predicate, if it's ecb->dte_action->dta_next it's
7745                          * in action #1, and so on.
7746                          */
7747                         for (err = ecb->dte_action, ndx = 0;
7748                             err != act; err = err->dta_next, ndx++)
7749                                 continue;
7750
7751                         dtrace_probe_error(state, ecb->dte_epid, ndx,
7752                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7753                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7754                             cpu_core[cpuid].cpuc_dtrace_illval);
7755
7756                         continue;
7757                 }
7758
7759                 if (!committed)
7760                         buf->dtb_offset = offs + ecb->dte_size;
7761         }
7762
7763         if (vtime)
7764                 curthread->t_dtrace_start = dtrace_gethrtime();
7765
7766         dtrace_interrupt_enable(cookie);
7767 }
7768
7769 /*
7770  * DTrace Probe Hashing Functions
7771  *
7772  * The functions in this section (and indeed, the functions in remaining
7773  * sections) are not _called_ from probe context.  (Any exceptions to this are
7774  * marked with a "Note:".)  Rather, they are called from elsewhere in the
7775  * DTrace framework to look-up probes in, add probes to and remove probes from
7776  * the DTrace probe hashes.  (Each probe is hashed by each element of the
7777  * probe tuple -- allowing for fast lookups, regardless of what was
7778  * specified.)
7779  */
/*
 * Hash a NUL-terminated string.  Each character is folded in by shifting the
 * running value left four bits and adding the character; whenever any of the
 * top four bits become set they are XORed back in (shifted down by 24) and
 * then cleared, so the returned value never has its top nibble set.
 */
static uint_t
dtrace_hash_str(const char *p)
{
        uint_t hash = 0;
        unsigned int top;

        for (; *p != '\0'; p++) {
                hash = (hash << 4) + *p;

                /* Fold any overflow out of the top nibble back in. */
                top = hash & 0xf0000000;
                if (top != 0)
                        hash ^= top >> 24;
                hash &= ~top;
        }

        return (hash);
}
7794
7795 static dtrace_hash_t *
7796 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7797 {
7798         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7799
7800         hash->dth_stroffs = stroffs;
7801         hash->dth_nextoffs = nextoffs;
7802         hash->dth_prevoffs = prevoffs;
7803
7804         hash->dth_size = 1;
7805         hash->dth_mask = hash->dth_size - 1;
7806
7807         hash->dth_tab = kmem_zalloc(hash->dth_size *
7808             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7809
7810         return (hash);
7811 }
7812
7813 static void
7814 dtrace_hash_destroy(dtrace_hash_t *hash)
7815 {
7816 #ifdef DEBUG
7817         int i;
7818
7819         for (i = 0; i < hash->dth_size; i++)
7820                 ASSERT(hash->dth_tab[i] == NULL);
7821 #endif
7822
7823         kmem_free(hash->dth_tab,
7824             hash->dth_size * sizeof (dtrace_hashbucket_t *));
7825         kmem_free(hash, sizeof (dtrace_hash_t));
7826 }
7827
7828 static void
7829 dtrace_hash_resize(dtrace_hash_t *hash)
7830 {
7831         int size = hash->dth_size, i, ndx;
7832         int new_size = hash->dth_size << 1;
7833         int new_mask = new_size - 1;
7834         dtrace_hashbucket_t **new_tab, *bucket, *next;
7835
7836         ASSERT((new_size & new_mask) == 0);
7837
7838         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7839
7840         for (i = 0; i < size; i++) {
7841                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7842                         dtrace_probe_t *probe = bucket->dthb_chain;
7843
7844                         ASSERT(probe != NULL);
7845                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7846
7847                         next = bucket->dthb_next;
7848                         bucket->dthb_next = new_tab[ndx];
7849                         new_tab[ndx] = bucket;
7850                 }
7851         }
7852
7853         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7854         hash->dth_tab = new_tab;
7855         hash->dth_size = new_size;
7856         hash->dth_mask = new_mask;
7857 }
7858
7859 static void
7860 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7861 {
7862         int hashval = DTRACE_HASHSTR(hash, new);
7863         int ndx = hashval & hash->dth_mask;
7864         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7865         dtrace_probe_t **nextp, **prevp;
7866
7867         for (; bucket != NULL; bucket = bucket->dthb_next) {
7868                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7869                         goto add;
7870         }
7871
7872         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7873                 dtrace_hash_resize(hash);
7874                 dtrace_hash_add(hash, new);
7875                 return;
7876         }
7877
7878         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7879         bucket->dthb_next = hash->dth_tab[ndx];
7880         hash->dth_tab[ndx] = bucket;
7881         hash->dth_nbuckets++;
7882
7883 add:
7884         nextp = DTRACE_HASHNEXT(hash, new);
7885         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7886         *nextp = bucket->dthb_chain;
7887
7888         if (bucket->dthb_chain != NULL) {
7889                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7890                 ASSERT(*prevp == NULL);
7891                 *prevp = new;
7892         }
7893
7894         bucket->dthb_chain = new;
7895         bucket->dthb_len++;
7896 }
7897
7898 static dtrace_probe_t *
7899 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7900 {
7901         int hashval = DTRACE_HASHSTR(hash, template);
7902         int ndx = hashval & hash->dth_mask;
7903         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7904
7905         for (; bucket != NULL; bucket = bucket->dthb_next) {
7906                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7907                         return (bucket->dthb_chain);
7908         }
7909
7910         return (NULL);
7911 }
7912
7913 static int
7914 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7915 {
7916         int hashval = DTRACE_HASHSTR(hash, template);
7917         int ndx = hashval & hash->dth_mask;
7918         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7919
7920         for (; bucket != NULL; bucket = bucket->dthb_next) {
7921                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7922                         return (bucket->dthb_len);
7923         }
7924
7925         return (0);
7926 }
7927
7928 static void
7929 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7930 {
7931         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7932         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7933
7934         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7935         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7936
7937         /*
7938          * Find the bucket that we're removing this probe from.
7939          */
7940         for (; bucket != NULL; bucket = bucket->dthb_next) {
7941                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7942                         break;
7943         }
7944
7945         ASSERT(bucket != NULL);
7946
7947         if (*prevp == NULL) {
7948                 if (*nextp == NULL) {
7949                         /*
7950                          * The removed probe was the only probe on this
7951                          * bucket; we need to remove the bucket.
7952                          */
7953                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7954
7955                         ASSERT(bucket->dthb_chain == probe);
7956                         ASSERT(b != NULL);
7957
7958                         if (b == bucket) {
7959                                 hash->dth_tab[ndx] = bucket->dthb_next;
7960                         } else {
7961                                 while (b->dthb_next != bucket)
7962                                         b = b->dthb_next;
7963                                 b->dthb_next = bucket->dthb_next;
7964                         }
7965
7966                         ASSERT(hash->dth_nbuckets > 0);
7967                         hash->dth_nbuckets--;
7968                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7969                         return;
7970                 }
7971
7972                 bucket->dthb_chain = *nextp;
7973         } else {
7974                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7975         }
7976
7977         if (*nextp != NULL)
7978                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7979 }
7980
7981 /*
7982  * DTrace Utility Functions
7983  *
7984  * These are random utility functions that are _not_ called from probe context.
7985  */
7986 static int
7987 dtrace_badattr(const dtrace_attribute_t *a)
7988 {
7989         return (a->dtat_name > DTRACE_STABILITY_MAX ||
7990             a->dtat_data > DTRACE_STABILITY_MAX ||
7991             a->dtat_class > DTRACE_CLASS_MAX);
7992 }
7993
7994 /*
7995  * Return a duplicate copy of a string.  If the specified string is NULL,
7996  * this function returns a zero-length string.
7997  */
7998 static char *
7999 dtrace_strdup(const char *str)
8000 {
8001         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8002
8003         if (str != NULL)
8004                 (void) strcpy(new, str);
8005
8006         return (new);
8007 }
8008
#define DTRACE_ISALPHA(c)       \
        (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Validate a name.  Returns 1 if the name contains a disallowed character and
 * 0 otherwise; NULL and the empty string are both accepted (0).
 *
 * The first character must be an ASCII letter, '-', '_' or '.'.  Every
 * subsequent character may additionally be a decimal digit or a backquote.
 */
static int
dtrace_badname(const char *s)
{
        const char *cp;

        if (s == NULL || *s == '\0')
                return (0);

        /* Leading character: letters and "-_." only. */
        switch (*s) {
        case '-':
        case '_':
        case '.':
                break;
        default:
                if (!DTRACE_ISALPHA(*s))
                        return (1);
                break;
        }

        /* Remaining characters: also allow digits and '`'. */
        for (cp = s + 1; *cp != '\0'; cp++) {
                char c = *cp;

                if (DTRACE_ISALPHA(c) || (c >= '0' && c <= '9'))
                        continue;

                if (c == '-' || c == '_' || c == '.' || c == '`')
                        continue;

                return (1);
        }

        return (0);
}
8031
8032 static void
8033 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8034 {
8035         uint32_t priv;
8036
8037 #if defined(sun)
8038         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8039                 /*
8040                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
8041                  */
8042                 priv = DTRACE_PRIV_ALL;
8043         } else {
8044                 *uidp = crgetuid(cr);
8045                 *zoneidp = crgetzoneid(cr);
8046
8047                 priv = 0;
8048                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8049                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8050                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8051                         priv |= DTRACE_PRIV_USER;
8052                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8053                         priv |= DTRACE_PRIV_PROC;
8054                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8055                         priv |= DTRACE_PRIV_OWNER;
8056                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8057                         priv |= DTRACE_PRIV_ZONEOWNER;
8058         }
8059 #else
8060         priv = DTRACE_PRIV_ALL;
8061 #endif
8062
8063         *privp = priv;
8064 }
8065
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of an error message in the dtrace_errhash table
 * (only built when DTRACE_ERRDEBUG is defined).  Messages are keyed by
 * pointer identity, not by string contents: the table is probed linearly
 * from the string's hash until either the same pointer is found (its count
 * is bumped) or an empty slot is found (the message is installed with a
 * count of 1).  If all DTRACE_ERRHASHSZ slots are taken by other messages,
 * we panic.  dtrace_errlast and dtrace_errthread are updated on every call.
 * All of this is done while holding dtrace_errlock.
 */
static void
dtrace_errdebug(const char *str)
{
        int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
        int probes;

        mutex_enter(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = curthread;

        for (probes = 0; probes < DTRACE_ERRHASHSZ; probes++) {
                if (dtrace_errhash[hval].dter_msg == str) {
                        dtrace_errhash[hval].dter_count++;
                        break;
                }

                if (dtrace_errhash[hval].dter_msg == NULL) {
                        dtrace_errhash[hval].dter_msg = str;
                        dtrace_errhash[hval].dter_count = 1;
                        break;
                }

                /* Slot holds a different message; try the next one. */
                hval = (hval + 1) % DTRACE_ERRHASHSZ;
        }

        /* Every slot was examined without finding a home for str. */
        if (probes == DTRACE_ERRHASHSZ)
                panic("dtrace: undersized error hash");

        mutex_exit(&dtrace_errlock);
}
#endif
8098
8099 /*
8100  * DTrace Matching Functions
8101  *
8102  * These functions are used to match groups of probes, given some elements of
8103  * a probe tuple, or some globbed expressions for elements of a probe tuple.
8104  */
8105 static int
8106 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8107     zoneid_t zoneid)
8108 {
8109         if (priv != DTRACE_PRIV_ALL) {
8110                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8111                 uint32_t match = priv & ppriv;
8112
8113                 /*
8114                  * No PRIV_DTRACE_* privileges...
8115                  */
8116                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8117                     DTRACE_PRIV_KERNEL)) == 0)
8118                         return (0);
8119
8120                 /*
8121                  * No matching bits, but there were bits to match...
8122                  */
8123                 if (match == 0 && ppriv != 0)
8124                         return (0);
8125
8126                 /*
8127                  * Need to have permissions to the process, but don't...
8128                  */
8129                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8130                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8131                         return (0);
8132                 }
8133
8134                 /*
8135                  * Need to be in the same zone unless we possess the
8136                  * privilege to examine all zones.
8137                  */
8138                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8139                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8140                         return (0);
8141                 }
8142         }
8143
8144         return (1);
8145 }
8146
8147 /*
8148  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8149  * consists of input pattern strings and an ops-vector to evaluate them.
8150  * This function returns >0 for match, 0 for no match, and <0 for error.
8151  */
8152 static int
8153 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8154     uint32_t priv, uid_t uid, zoneid_t zoneid)
8155 {
8156         dtrace_provider_t *pvp = prp->dtpr_provider;
8157         int rv;
8158
8159         if (pvp->dtpv_defunct)
8160                 return (0);
8161
8162         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8163                 return (rv);
8164
8165         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8166                 return (rv);
8167
8168         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8169                 return (rv);
8170
8171         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8172                 return (rv);
8173
8174         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8175                 return (0);
8176
8177         return (rv);
8178 }
8179
8180 /*
8181  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8182  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8183  * libc's version, the kernel version only applies to 8-bit ASCII strings.
8184  * In addition, all of the recursion cases except for '*' matching have been
8185  * unwound.  For '*', we still implement recursive evaluation, but a depth
8186  * counter is maintained and matching is aborted if we recurse too deep.
8187  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8188  */
8189 static int
8190 dtrace_match_glob(const char *s, const char *p, int depth)
8191 {
8192         const char *olds;
8193         char s1, c;
8194         int gs;
8195
8196         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8197                 return (-1);
8198
8199         if (s == NULL)
8200                 s = ""; /* treat NULL as empty string */
8201
8202 top:
8203         olds = s;
8204         s1 = *s++;
8205
8206         if (p == NULL)
8207                 return (0);
8208
8209         if ((c = *p++) == '\0')
8210                 return (s1 == '\0');
8211
8212         switch (c) {
8213         case '[': {
8214                 int ok = 0, notflag = 0;
8215                 char lc = '\0';
8216
8217                 if (s1 == '\0')
8218                         return (0);
8219
8220                 if (*p == '!') {
8221                         notflag = 1;
8222                         p++;
8223                 }
8224
8225                 if ((c = *p++) == '\0')
8226                         return (0);
8227
8228                 do {
8229                         if (c == '-' && lc != '\0' && *p != ']') {
8230                                 if ((c = *p++) == '\0')
8231                                         return (0);
8232                                 if (c == '\\' && (c = *p++) == '\0')
8233                                         return (0);
8234
8235                                 if (notflag) {
8236                                         if (s1 < lc || s1 > c)
8237                                                 ok++;
8238                                         else
8239                                                 return (0);
8240                                 } else if (lc <= s1 && s1 <= c)
8241                                         ok++;
8242
8243                         } else if (c == '\\' && (c = *p++) == '\0')
8244                                 return (0);
8245
8246                         lc = c; /* save left-hand 'c' for next iteration */
8247
8248                         if (notflag) {
8249                                 if (s1 != c)
8250                                         ok++;
8251                                 else
8252                                         return (0);
8253                         } else if (s1 == c)
8254                                 ok++;
8255
8256                         if ((c = *p++) == '\0')
8257                                 return (0);
8258
8259                 } while (c != ']');
8260
8261                 if (ok)
8262                         goto top;
8263
8264                 return (0);
8265         }
8266
8267         case '\\':
8268                 if ((c = *p++) == '\0')
8269                         return (0);
8270                 /*FALLTHRU*/
8271
8272         default:
8273                 if (c != s1)
8274                         return (0);
8275                 /*FALLTHRU*/
8276
8277         case '?':
8278                 if (s1 != '\0')
8279                         goto top;
8280                 return (0);
8281
8282         case '*':
8283                 while (*p == '*')
8284                         p++; /* consecutive *'s are identical to a single one */
8285
8286                 if (*p == '\0')
8287                         return (1);
8288
8289                 for (s = olds; *s != '\0'; s++) {
8290                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8291                                 return (gs);
8292                 }
8293
8294                 return (0);
8295         }
8296 }
8297
/*
 * Exact-match comparison: returns 1 when 's' is non-NULL and identical to
 * the pattern 'p', and 0 otherwise.  'depth' is ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (strcmp(s, p) == 0);
}
8304
/*
 * Matcher used for the empty pattern: all three arguments are ignored and
 * every input is reported as a match.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
        /* always match the empty pattern */
        return (1);
}
8311
/*
 * Matcher that accepts any non-empty string: returns 1 when 's' is non-NULL
 * and has at least one character, 0 otherwise.  'p' and 'depth' are ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
        if (s == NULL)
                return (0);

        return (*s != '\0');
}
8318
8319 static int
8320 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8321     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8322 {
8323         dtrace_probe_t template, *probe;
8324         dtrace_hash_t *hash = NULL;
8325         int len, best = INT_MAX, nmatched = 0;
8326         dtrace_id_t i;
8327
8328         ASSERT(MUTEX_HELD(&dtrace_lock));
8329
8330         /*
8331          * If the probe ID is specified in the key, just lookup by ID and
8332          * invoke the match callback once if a matching probe is found.
8333          */
8334         if (pkp->dtpk_id != DTRACE_IDNONE) {
8335                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8336                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8337                         (void) (*matched)(probe, arg);
8338                         nmatched++;
8339                 }
8340                 return (nmatched);
8341         }
8342
8343         template.dtpr_mod = (char *)pkp->dtpk_mod;
8344         template.dtpr_func = (char *)pkp->dtpk_func;
8345         template.dtpr_name = (char *)pkp->dtpk_name;
8346
8347         /*
8348          * We want to find the most distinct of the module name, function
8349          * name, and name.  So for each one that is not a glob pattern or
8350          * empty string, we perform a lookup in the corresponding hash and
8351          * use the hash table with the fewest collisions to do our search.
8352          */
8353         if (pkp->dtpk_mmatch == &dtrace_match_string &&
8354             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8355                 best = len;
8356                 hash = dtrace_bymod;
8357         }
8358
8359         if (pkp->dtpk_fmatch == &dtrace_match_string &&
8360             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8361                 best = len;
8362                 hash = dtrace_byfunc;
8363         }
8364
8365         if (pkp->dtpk_nmatch == &dtrace_match_string &&
8366             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8367                 best = len;
8368                 hash = dtrace_byname;
8369         }
8370
8371         /*
8372          * If we did not select a hash table, iterate over every probe and
8373          * invoke our callback for each one that matches our input probe key.
8374          */
8375         if (hash == NULL) {
8376                 for (i = 0; i < dtrace_nprobes; i++) {
8377                         if ((probe = dtrace_probes[i]) == NULL ||
8378                             dtrace_match_probe(probe, pkp, priv, uid,
8379                             zoneid) <= 0)
8380                                 continue;
8381
8382                         nmatched++;
8383
8384                         if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8385                                 break;
8386                 }
8387
8388                 return (nmatched);
8389         }
8390
8391         /*
8392          * If we selected a hash table, iterate over each probe of the same key
8393          * name and invoke the callback for every probe that matches the other
8394          * attributes of our input probe key.
8395          */
8396         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8397             probe = *(DTRACE_HASHNEXT(hash, probe))) {
8398
8399                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8400                         continue;
8401
8402                 nmatched++;
8403
8404                 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8405                         break;
8406         }
8407
8408         return (nmatched);
8409 }
8410
8411 /*
8412  * Return the function pointer dtrace_probecmp() should use to compare the
8413  * specified pattern with a string.  For NULL or empty patterns, we select
8414  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8415  * For non-empty non-glob strings, we use dtrace_match_string().
8416  */
8417 static dtrace_probekey_f *
8418 dtrace_probekey_func(const char *p)
8419 {
8420         char c;
8421
8422         if (p == NULL || *p == '\0')
8423                 return (&dtrace_match_nul);
8424
8425         while ((c = *p++) != '\0') {
8426                 if (c == '[' || c == '?' || c == '*' || c == '\\')
8427                         return (&dtrace_match_glob);
8428         }
8429
8430         return (&dtrace_match_string);
8431 }
8432
8433 /*
8434  * Build a probe comparison key for use with dtrace_match_probe() from the
8435  * given probe description.  By convention, a null key only matches anchored
8436  * probes: if each field is the empty string, reset dtpk_fmatch to
8437  * dtrace_match_nonzero().
8438  */
8439 static void
8440 dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8441 {
8442         pkp->dtpk_prov = pdp->dtpd_provider;
8443         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8444
8445         pkp->dtpk_mod = pdp->dtpd_mod;
8446         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8447
8448         pkp->dtpk_func = pdp->dtpd_func;
8449         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8450
8451         pkp->dtpk_name = pdp->dtpd_name;
8452         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8453
8454         pkp->dtpk_id = pdp->dtpd_id;
8455
8456         if (pkp->dtpk_id == DTRACE_IDNONE &&
8457             pkp->dtpk_pmatch == &dtrace_match_nul &&
8458             pkp->dtpk_mmatch == &dtrace_match_nul &&
8459             pkp->dtpk_fmatch == &dtrace_match_nul &&
8460             pkp->dtpk_nmatch == &dtrace_match_nul)
8461                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8462 }
8463
8464 /*
8465  * DTrace Provider-to-Framework API Functions
8466  *
8467  * These functions implement much of the Provider-to-Framework API, as
8468  * described in <sys/dtrace.h>.  The parts of the API not in this section are
8469  * the functions in the API for probe management (found below), and
8470  * dtrace_probe() itself (found above).
8471  */
8472
8473 /*
8474  * Register the calling provider with the DTrace framework.  This should
8475  * generally be called by DTrace providers in their attach(9E) entry point.
8476  */
8477 int
8478 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8479     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8480 {
8481         dtrace_provider_t *provider;
8482
8483         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8484                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8485                     "arguments", name ? name : "<NULL>");
8486                 return (EINVAL);
8487         }
8488
8489         if (name[0] == '\0' || dtrace_badname(name)) {
8490                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8491                     "provider name", name);
8492                 return (EINVAL);
8493         }
8494
8495         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8496             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8497             pops->dtps_destroy == NULL ||
8498             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8499                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8500                     "provider ops", name);
8501                 return (EINVAL);
8502         }
8503
8504         if (dtrace_badattr(&pap->dtpa_provider) ||
8505             dtrace_badattr(&pap->dtpa_mod) ||
8506             dtrace_badattr(&pap->dtpa_func) ||
8507             dtrace_badattr(&pap->dtpa_name) ||
8508             dtrace_badattr(&pap->dtpa_args)) {
8509                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8510                     "provider attributes", name);
8511                 return (EINVAL);
8512         }
8513
8514         if (priv & ~DTRACE_PRIV_ALL) {
8515                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8516                     "privilege attributes", name);
8517                 return (EINVAL);
8518         }
8519
8520         if ((priv & DTRACE_PRIV_KERNEL) &&
8521             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8522             pops->dtps_usermode == NULL) {
8523                 cmn_err(CE_WARN, "failed to register provider '%s': need "
8524                     "dtps_usermode() op for given privilege attributes", name);
8525                 return (EINVAL);
8526         }
8527
8528         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8529         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8530         (void) strcpy(provider->dtpv_name, name);
8531
8532         provider->dtpv_attr = *pap;
8533         provider->dtpv_priv.dtpp_flags = priv;
8534         if (cr != NULL) {
8535                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8536                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8537         }
8538         provider->dtpv_pops = *pops;
8539
8540         if (pops->dtps_provide == NULL) {
8541                 ASSERT(pops->dtps_provide_module != NULL);
8542                 provider->dtpv_pops.dtps_provide =
8543                     (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8544         }
8545
8546         if (pops->dtps_provide_module == NULL) {
8547                 ASSERT(pops->dtps_provide != NULL);
8548                 provider->dtpv_pops.dtps_provide_module =
8549                     (void (*)(void *, modctl_t *))dtrace_nullop;
8550         }
8551
8552         if (pops->dtps_suspend == NULL) {
8553                 ASSERT(pops->dtps_resume == NULL);
8554                 provider->dtpv_pops.dtps_suspend =
8555                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8556                 provider->dtpv_pops.dtps_resume =
8557                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8558         }
8559
8560         provider->dtpv_arg = arg;
8561         *idp = (dtrace_provider_id_t)provider;
8562
8563         if (pops == &dtrace_provider_ops) {
8564                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8565                 ASSERT(MUTEX_HELD(&dtrace_lock));
8566                 ASSERT(dtrace_anon.dta_enabling == NULL);
8567
8568                 /*
8569                  * We make sure that the DTrace provider is at the head of
8570                  * the provider chain.
8571                  */
8572                 provider->dtpv_next = dtrace_provider;
8573                 dtrace_provider = provider;
8574                 return (0);
8575         }
8576
8577         mutex_enter(&dtrace_provider_lock);
8578         mutex_enter(&dtrace_lock);
8579
8580         /*
8581          * If there is at least one provider registered, we'll add this
8582          * provider after the first provider.
8583          */
8584         if (dtrace_provider != NULL) {
8585                 provider->dtpv_next = dtrace_provider->dtpv_next;
8586                 dtrace_provider->dtpv_next = provider;
8587         } else {
8588                 dtrace_provider = provider;
8589         }
8590
8591         if (dtrace_retained != NULL) {
8592                 dtrace_enabling_provide(provider);
8593
8594                 /*
8595                  * Now we need to call dtrace_enabling_matchall() -- which
8596                  * will acquire cpu_lock and dtrace_lock.  We therefore need
8597                  * to drop all of our locks before calling into it...
8598                  */
8599                 mutex_exit(&dtrace_lock);
8600                 mutex_exit(&dtrace_provider_lock);
8601                 dtrace_enabling_matchall();
8602
8603                 return (0);
8604         }
8605
8606         mutex_exit(&dtrace_lock);
8607         mutex_exit(&dtrace_provider_lock);
8608
8609         return (0);
8610 }
8611
8612 /*
8613  * Unregister the specified provider from the DTrace framework.  This should
8614  * generally be called by DTrace providers in their detach(9E) entry point.
8615  */
8616 int
8617 dtrace_unregister(dtrace_provider_id_t id)
8618 {
8619         dtrace_provider_t *old = (dtrace_provider_t *)id;
8620         dtrace_provider_t *prev = NULL;
8621         int i, self = 0, noreap = 0;
8622         dtrace_probe_t *probe, *first = NULL;
8623
8624         if (old->dtpv_pops.dtps_enable ==
8625             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8626                 /*
8627                  * If DTrace itself is the provider, we're called with locks
8628                  * already held.
8629                  */
8630                 ASSERT(old == dtrace_provider);
8631 #if defined(sun)
8632                 ASSERT(dtrace_devi != NULL);
8633 #endif
8634                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8635                 ASSERT(MUTEX_HELD(&dtrace_lock));
8636                 self = 1;
8637
8638                 if (dtrace_provider->dtpv_next != NULL) {
8639                         /*
8640                          * There's another provider here; return failure.
8641                          */
8642                         return (EBUSY);
8643                 }
8644         } else {
8645                 mutex_enter(&dtrace_provider_lock);
8646 #if defined(sun)
8647                 mutex_enter(&mod_lock);
8648 #endif
8649                 mutex_enter(&dtrace_lock);
8650         }
8651
8652         /*
8653          * If anyone has /dev/dtrace open, or if there are anonymous enabled
8654          * probes, we refuse to let providers slither away, unless this
8655          * provider has already been explicitly invalidated.
8656          */
8657         if (!old->dtpv_defunct &&
8658             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8659             dtrace_anon.dta_state->dts_necbs > 0))) {
8660                 if (!self) {
8661                         mutex_exit(&dtrace_lock);
8662 #if defined(sun)
8663                         mutex_exit(&mod_lock);
8664 #endif
8665                         mutex_exit(&dtrace_provider_lock);
8666                 }
8667                 return (EBUSY);
8668         }
8669
8670         /*
8671          * Attempt to destroy the probes associated with this provider.
8672          */
8673         for (i = 0; i < dtrace_nprobes; i++) {
8674                 if ((probe = dtrace_probes[i]) == NULL)
8675                         continue;
8676
8677                 if (probe->dtpr_provider != old)
8678                         continue;
8679
8680                 if (probe->dtpr_ecb == NULL)
8681                         continue;
8682
8683                 /*
8684                  * If we are trying to unregister a defunct provider, and the
8685                  * provider was made defunct within the interval dictated by
8686                  * dtrace_unregister_defunct_reap, we'll (asynchronously)
8687                  * attempt to reap our enablings.  To denote that the provider
8688                  * should reattempt to unregister itself at some point in the
8689                  * future, we will return a differentiable error code (EAGAIN
8690                  * instead of EBUSY) in this case.
8691                  */
8692                 if (dtrace_gethrtime() - old->dtpv_defunct >
8693                     dtrace_unregister_defunct_reap)
8694                         noreap = 1;
8695
8696                 if (!self) {
8697                         mutex_exit(&dtrace_lock);
8698 #if defined(sun)
8699                         mutex_exit(&mod_lock);
8700 #endif
8701                         mutex_exit(&dtrace_provider_lock);
8702                 }
8703
8704                 if (noreap)
8705                         return (EBUSY);
8706
8707                 (void) taskq_dispatch(dtrace_taskq,
8708                     (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8709
8710                 return (EAGAIN);
8711         }
8712
8713         /*
8714          * All of the probes for this provider are disabled; we can safely
8715          * remove all of them from their hash chains and from the probe array.
8716          */
8717         for (i = 0; i < dtrace_nprobes; i++) {
8718                 if ((probe = dtrace_probes[i]) == NULL)
8719                         continue;
8720
8721                 if (probe->dtpr_provider != old)
8722                         continue;
8723
8724                 dtrace_probes[i] = NULL;
8725
8726                 dtrace_hash_remove(dtrace_bymod, probe);
8727                 dtrace_hash_remove(dtrace_byfunc, probe);
8728                 dtrace_hash_remove(dtrace_byname, probe);
8729
8730                 if (first == NULL) {
8731                         first = probe;
8732                         probe->dtpr_nextmod = NULL;
8733                 } else {
8734                         probe->dtpr_nextmod = first;
8735                         first = probe;
8736                 }
8737         }
8738
8739         /*
8740          * The provider's probes have been removed from the hash chains and
8741          * from the probe array.  Now issue a dtrace_sync() to be sure that
8742          * everyone has cleared out from any probe array processing.
8743          */
8744         dtrace_sync();
8745
8746         for (probe = first; probe != NULL; probe = first) {
8747                 first = probe->dtpr_nextmod;
8748
8749                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8750                     probe->dtpr_arg);
8751                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8752                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8753                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8754 #if defined(sun)
8755                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8756 #else
8757                 free_unr(dtrace_arena, probe->dtpr_id);
8758 #endif
8759                 kmem_free(probe, sizeof (dtrace_probe_t));
8760         }
8761
8762         if ((prev = dtrace_provider) == old) {
8763 #if defined(sun)
8764                 ASSERT(self || dtrace_devi == NULL);
8765                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8766 #endif
8767                 dtrace_provider = old->dtpv_next;
8768         } else {
8769                 while (prev != NULL && prev->dtpv_next != old)
8770                         prev = prev->dtpv_next;
8771
8772                 if (prev == NULL) {
8773                         panic("attempt to unregister non-existent "
8774                             "dtrace provider %p\n", (void *)id);
8775                 }
8776
8777                 prev->dtpv_next = old->dtpv_next;
8778         }
8779
8780         if (!self) {
8781                 mutex_exit(&dtrace_lock);
8782 #if defined(sun)
8783                 mutex_exit(&mod_lock);
8784 #endif
8785                 mutex_exit(&dtrace_provider_lock);
8786         }
8787
8788         kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8789         kmem_free(old, sizeof (dtrace_provider_t));
8790
8791         return (0);
8792 }
8793
8794 /*
8795  * Invalidate the specified provider.  All subsequent probe lookups for the
8796  * specified provider will fail, but its probes will not be removed.
8797  */
8798 void
8799 dtrace_invalidate(dtrace_provider_id_t id)
8800 {
8801         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8802
8803         ASSERT(pvp->dtpv_pops.dtps_enable !=
8804             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8805
8806         mutex_enter(&dtrace_provider_lock);
8807         mutex_enter(&dtrace_lock);
8808
8809         pvp->dtpv_defunct = dtrace_gethrtime();
8810
8811         mutex_exit(&dtrace_lock);
8812         mutex_exit(&dtrace_provider_lock);
8813 }
8814
8815 /*
8816  * Indicate whether or not DTrace has attached.
8817  */
8818 int
8819 dtrace_attached(void)
8820 {
8821         /*
8822          * dtrace_provider will be non-NULL iff the DTrace driver has
8823          * attached.  (It's non-NULL because DTrace is always itself a
8824          * provider.)
8825          */
8826         return (dtrace_provider != NULL);
8827 }
8828
8829 /*
8830  * Remove all the unenabled probes for the given provider.  This function is
8831  * not unlike dtrace_unregister(), except that it doesn't remove the provider
8832  * -- just as many of its associated probes as it can.
8833  */
8834 int
8835 dtrace_condense(dtrace_provider_id_t id)
8836 {
8837         dtrace_provider_t *prov = (dtrace_provider_t *)id;
8838         int i;
8839         dtrace_probe_t *probe;
8840
8841         /*
8842          * Make sure this isn't the dtrace provider itself.
8843          */
8844         ASSERT(prov->dtpv_pops.dtps_enable !=
8845             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8846
8847         mutex_enter(&dtrace_provider_lock);
8848         mutex_enter(&dtrace_lock);
8849
8850         /*
8851          * Attempt to destroy the probes associated with this provider.
8852          */
8853         for (i = 0; i < dtrace_nprobes; i++) {
8854                 if ((probe = dtrace_probes[i]) == NULL)
8855                         continue;
8856
8857                 if (probe->dtpr_provider != prov)
8858                         continue;
8859
8860                 if (probe->dtpr_ecb != NULL)
8861                         continue;
8862
8863                 dtrace_probes[i] = NULL;
8864
8865                 dtrace_hash_remove(dtrace_bymod, probe);
8866                 dtrace_hash_remove(dtrace_byfunc, probe);
8867                 dtrace_hash_remove(dtrace_byname, probe);
8868
8869                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8870                     probe->dtpr_arg);
8871                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8872                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8873                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8874                 kmem_free(probe, sizeof (dtrace_probe_t));
8875 #if defined(sun)
8876                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8877 #else
8878                 free_unr(dtrace_arena, i + 1);
8879 #endif
8880         }
8881
8882         mutex_exit(&dtrace_lock);
8883         mutex_exit(&dtrace_provider_lock);
8884
8885         return (0);
8886 }
8887
/*
 * DTrace Probe Management Functions
 *
 * The functions in this section perform the DTrace probe management,
 * including functions to create probes, look up probes, and call into the
 * providers to request that probes be provided.  Some of these functions are
 * in the Provider-to-Framework API; these functions can be identified by the
 * fact that they are not declared "static".
 */
8896  */
8897
8898 /*
8899  * Create a probe with the specified module name, function name, and name.
8900  */
8901 dtrace_id_t
8902 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8903     const char *func, const char *name, int aframes, void *arg)
8904 {
8905         dtrace_probe_t *probe, **probes;
8906         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8907         dtrace_id_t id;
8908
8909         if (provider == dtrace_provider) {
8910                 ASSERT(MUTEX_HELD(&dtrace_lock));
8911         } else {
8912                 mutex_enter(&dtrace_lock);
8913         }
8914
8915 #if defined(sun)
8916         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8917             VM_BESTFIT | VM_SLEEP);
8918 #else
8919         id = alloc_unr(dtrace_arena);
8920 #endif
8921         probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8922
8923         probe->dtpr_id = id;
8924         probe->dtpr_gen = dtrace_probegen++;
8925         probe->dtpr_mod = dtrace_strdup(mod);
8926         probe->dtpr_func = dtrace_strdup(func);
8927         probe->dtpr_name = dtrace_strdup(name);
8928         probe->dtpr_arg = arg;
8929         probe->dtpr_aframes = aframes;
8930         probe->dtpr_provider = provider;
8931
8932         dtrace_hash_add(dtrace_bymod, probe);
8933         dtrace_hash_add(dtrace_byfunc, probe);
8934         dtrace_hash_add(dtrace_byname, probe);
8935
8936         if (id - 1 >= dtrace_nprobes) {
8937                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8938                 size_t nsize = osize << 1;
8939
8940                 if (nsize == 0) {
8941                         ASSERT(osize == 0);
8942                         ASSERT(dtrace_probes == NULL);
8943                         nsize = sizeof (dtrace_probe_t *);
8944                 }
8945
8946                 probes = kmem_zalloc(nsize, KM_SLEEP);
8947
8948                 if (dtrace_probes == NULL) {
8949                         ASSERT(osize == 0);
8950                         dtrace_probes = probes;
8951                         dtrace_nprobes = 1;
8952                 } else {
8953                         dtrace_probe_t **oprobes = dtrace_probes;
8954
8955                         bcopy(oprobes, probes, osize);
8956                         dtrace_membar_producer();
8957                         dtrace_probes = probes;
8958
8959                         dtrace_sync();
8960
8961                         /*
8962                          * All CPUs are now seeing the new probes array; we can
8963                          * safely free the old array.
8964                          */
8965                         kmem_free(oprobes, osize);
8966                         dtrace_nprobes <<= 1;
8967                 }
8968
8969                 ASSERT(id - 1 < dtrace_nprobes);
8970         }
8971
8972         ASSERT(dtrace_probes[id - 1] == NULL);
8973         dtrace_probes[id - 1] = probe;
8974
8975         if (provider != dtrace_provider)
8976                 mutex_exit(&dtrace_lock);
8977
8978         return (id);
8979 }
8980
8981 static dtrace_probe_t *
8982 dtrace_probe_lookup_id(dtrace_id_t id)
8983 {
8984         ASSERT(MUTEX_HELD(&dtrace_lock));
8985
8986         if (id == 0 || id > dtrace_nprobes)
8987                 return (NULL);
8988
8989         return (dtrace_probes[id - 1]);
8990 }
8991
8992 static int
8993 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
8994 {
8995         *((dtrace_id_t *)arg) = probe->dtpr_id;
8996
8997         return (DTRACE_MATCH_DONE);
8998 }
8999
9000 /*
9001  * Look up a probe based on provider and one or more of module name, function
9002  * name and probe name.
9003  */
9004 dtrace_id_t
9005 dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9006     char *func, char *name)
9007 {
9008         dtrace_probekey_t pkey;
9009         dtrace_id_t id;
9010         int match;
9011
9012         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9013         pkey.dtpk_pmatch = &dtrace_match_string;
9014         pkey.dtpk_mod = mod;
9015         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9016         pkey.dtpk_func = func;
9017         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9018         pkey.dtpk_name = name;
9019         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9020         pkey.dtpk_id = DTRACE_IDNONE;
9021
9022         mutex_enter(&dtrace_lock);
9023         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9024             dtrace_probe_lookup_match, &id);
9025         mutex_exit(&dtrace_lock);
9026
9027         ASSERT(match == 1 || match == 0);
9028         return (match ? id : 0);
9029 }
9030
9031 /*
9032  * Returns the probe argument associated with the specified probe.
9033  */
9034 void *
9035 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9036 {
9037         dtrace_probe_t *probe;
9038         void *rval = NULL;
9039
9040         mutex_enter(&dtrace_lock);
9041
9042         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9043             probe->dtpr_provider == (dtrace_provider_t *)id)
9044                 rval = probe->dtpr_arg;
9045
9046         mutex_exit(&dtrace_lock);
9047
9048         return (rval);
9049 }
9050
9051 /*
9052  * Copy a probe into a probe description.
9053  */
9054 static void
9055 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9056 {
9057         bzero(pdp, sizeof (dtrace_probedesc_t));
9058         pdp->dtpd_id = prp->dtpr_id;
9059
9060         (void) strncpy(pdp->dtpd_provider,
9061             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
9062
9063         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
9064         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
9065         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
9066 }
9067
9068 /*
9069  * Called to indicate that a probe -- or probes -- should be provided by a
9070  * specfied provider.  If the specified description is NULL, the provider will
9071  * be told to provide all of its probes.  (This is done whenever a new
9072  * consumer comes along, or whenever a retained enabling is to be matched.) If
9073  * the specified description is non-NULL, the provider is given the
9074  * opportunity to dynamically provide the specified probe, allowing providers
9075  * to support the creation of probes on-the-fly.  (So-called _autocreated_
9076  * probes.)  If the provider is NULL, the operations will be applied to all
9077  * providers; if the provider is non-NULL the operations will only be applied
9078  * to the specified provider.  The dtrace_provider_lock must be held, and the
9079  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9080  * will need to grab the dtrace_lock when it reenters the framework through
9081  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9082  */
9083 static void
9084 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9085 {
9086 #if defined(sun)
9087         modctl_t *ctl;
9088 #endif
9089         int all = 0;
9090
9091         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
9092
9093         if (prv == NULL) {
9094                 all = 1;
9095                 prv = dtrace_provider;
9096         }
9097
9098         do {
9099                 /*
9100                  * First, call the blanket provide operation.
9101                  */
9102                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9103
9104 #if defined(sun)
9105                 /*
9106                  * Now call the per-module provide operation.  We will grab
9107                  * mod_lock to prevent the list from being modified.  Note
9108                  * that this also prevents the mod_busy bits from changing.
9109                  * (mod_busy can only be changed with mod_lock held.)
9110                  */
9111                 mutex_enter(&mod_lock);
9112
9113                 ctl = &modules;
9114                 do {
9115                         if (ctl->mod_busy || ctl->mod_mp == NULL)
9116                                 continue;
9117
9118                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9119
9120                 } while ((ctl = ctl->mod_next) != &modules);
9121
9122                 mutex_exit(&mod_lock);
9123 #endif
9124         } while (all && (prv = prv->dtpv_next) != NULL);
9125 }
9126
#if defined(sun)
/*
 * Iterate over each probe, and call the Framework-to-Provider API function
 * denoted by offs, where offs is a byte offset into the owning provider's
 * dtpv_pops structure.  Probes with a NULL dtpr_ecb (i.e. not enabled) are
 * passed over.
 */
static void
dtrace_probe_foreach(uintptr_t offs)
{
        dtrace_icookie_t cookie;
        int slot;

        /*
         * We disable interrupts to walk through the probe array.  This is
         * safe -- the dtrace_sync() in dtrace_unregister() assures that we
         * won't see stale data.
         */
        cookie = dtrace_interrupt_disable();

        for (slot = 0; slot < dtrace_nprobes; slot++) {
                dtrace_probe_t *probe = dtrace_probes[slot];
                dtrace_provider_t *prov;
                void (*op)(void *, dtrace_id_t, void *);

                /*
                 * Nothing to do for an empty slot, nor for a probe that
                 * isn't enabled.
                 */
                if (probe == NULL || probe->dtpr_ecb == NULL)
                        continue;

                prov = probe->dtpr_provider;
                op = *((void(**)(void *, dtrace_id_t, void *))
                    ((uintptr_t)&prov->dtpv_pops + offs));

                /* Probe ids are the array index plus one. */
                op(prov->dtpv_arg, slot + 1, probe->dtpr_arg);
        }

        dtrace_interrupt_enable(cookie);
}
#endif
9169
9170 static int
9171 dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9172 {
9173         dtrace_probekey_t pkey;
9174         uint32_t priv;
9175         uid_t uid;
9176         zoneid_t zoneid;
9177
9178         ASSERT(MUTEX_HELD(&dtrace_lock));
9179         dtrace_ecb_create_cache = NULL;
9180
9181         if (desc == NULL) {
9182                 /*
9183                  * If we're passed a NULL description, we're being asked to
9184                  * create an ECB with a NULL probe.
9185                  */
9186                 (void) dtrace_ecb_create_enable(NULL, enab);
9187                 return (0);
9188         }
9189
9190         dtrace_probekey(desc, &pkey);
9191         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9192             &priv, &uid, &zoneid);
9193
9194         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9195             enab));
9196 }
9197
9198 /*
9199  * DTrace Helper Provider Functions
9200  */
9201 static void
9202 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9203 {
9204         attr->dtat_name = DOF_ATTR_NAME(dofattr);
9205         attr->dtat_data = DOF_ATTR_DATA(dofattr);
9206         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9207 }
9208
9209 static void
9210 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9211     const dof_provider_t *dofprov, char *strtab)
9212 {
9213         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9214         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9215             dofprov->dofpv_provattr);
9216         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9217             dofprov->dofpv_modattr);
9218         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9219             dofprov->dofpv_funcattr);
9220         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9221             dofprov->dofpv_nameattr);
9222         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9223             dofprov->dofpv_argsattr);
9224 }
9225
9226 static void
9227 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9228 {
9229         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9230         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9231         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9232         dof_provider_t *provider;
9233         dof_probe_t *probe;
9234         uint32_t *off, *enoff;
9235         uint8_t *arg;
9236         char *strtab;
9237         uint_t i, nprobes;
9238         dtrace_helper_provdesc_t dhpv;
9239         dtrace_helper_probedesc_t dhpb;
9240         dtrace_meta_t *meta = dtrace_meta_pid;
9241         dtrace_mops_t *mops = &meta->dtm_mops;
9242         void *parg;
9243
9244         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9245         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9246             provider->dofpv_strtab * dof->dofh_secsize);
9247         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9248             provider->dofpv_probes * dof->dofh_secsize);
9249         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9250             provider->dofpv_prargs * dof->dofh_secsize);
9251         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9252             provider->dofpv_proffs * dof->dofh_secsize);
9253
9254         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9255         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9256         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9257         enoff = NULL;
9258
9259         /*
9260          * See dtrace_helper_provider_validate().
9261          */
9262         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9263             provider->dofpv_prenoffs != DOF_SECT_NONE) {
9264                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9265                     provider->dofpv_prenoffs * dof->dofh_secsize);
9266                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9267         }
9268
9269         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9270
9271         /*
9272          * Create the provider.
9273          */
9274         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9275
9276         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9277                 return;
9278
9279         meta->dtm_count++;
9280
9281         /*
9282          * Create the probes.
9283          */
9284         for (i = 0; i < nprobes; i++) {
9285                 probe = (dof_probe_t *)(uintptr_t)(daddr +
9286                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9287
9288                 dhpb.dthpb_mod = dhp->dofhp_mod;
9289                 dhpb.dthpb_func = strtab + probe->dofpr_func;
9290                 dhpb.dthpb_name = strtab + probe->dofpr_name;
9291                 dhpb.dthpb_base = probe->dofpr_addr;
9292                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
9293                 dhpb.dthpb_noffs = probe->dofpr_noffs;
9294                 if (enoff != NULL) {
9295                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9296                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9297                 } else {
9298                         dhpb.dthpb_enoffs = NULL;
9299                         dhpb.dthpb_nenoffs = 0;
9300                 }
9301                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9302                 dhpb.dthpb_nargc = probe->dofpr_nargc;
9303                 dhpb.dthpb_xargc = probe->dofpr_xargc;
9304                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9305                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9306
9307                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9308         }
9309 }
9310
9311 static void
9312 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9313 {
9314         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9315         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9316         int i;
9317
9318         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9319
9320         for (i = 0; i < dof->dofh_secnum; i++) {
9321                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9322                     dof->dofh_secoff + i * dof->dofh_secsize);
9323
9324                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9325                         continue;
9326
9327                 dtrace_helper_provide_one(dhp, sec, pid);
9328         }
9329
9330         /*
9331          * We may have just created probes, so we must now rematch against
9332          * any retained enablings.  Note that this call will acquire both
9333          * cpu_lock and dtrace_lock; the fact that we are holding
9334          * dtrace_meta_lock now is what defines the ordering with respect to
9335          * these three locks.
9336          */
9337         dtrace_enabling_matchall();
9338 }
9339
9340 static void
9341 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9342 {
9343         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9344         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9345         dof_sec_t *str_sec;
9346         dof_provider_t *provider;
9347         char *strtab;
9348         dtrace_helper_provdesc_t dhpv;
9349         dtrace_meta_t *meta = dtrace_meta_pid;
9350         dtrace_mops_t *mops = &meta->dtm_mops;
9351
9352         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9353         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9354             provider->dofpv_strtab * dof->dofh_secsize);
9355
9356         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9357
9358         /*
9359          * Create the provider.
9360          */
9361         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9362
9363         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9364
9365         meta->dtm_count--;
9366 }
9367
9368 static void
9369 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9370 {
9371         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9372         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9373         int i;
9374
9375         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9376
9377         for (i = 0; i < dof->dofh_secnum; i++) {
9378                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9379                     dof->dofh_secoff + i * dof->dofh_secsize);
9380
9381                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9382                         continue;
9383
9384                 dtrace_helper_provider_remove_one(dhp, sec, pid);
9385         }
9386 }
9387
9388 /*
9389  * DTrace Meta Provider-to-Framework API Functions
9390  *
9391  * These functions implement the Meta Provider-to-Framework API, as described
9392  * in <sys/dtrace.h>.
9393  */
9394 int
9395 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9396     dtrace_meta_provider_id_t *idp)
9397 {
9398         dtrace_meta_t *meta;
9399         dtrace_helpers_t *help, *next;
9400         int i;
9401
9402         *idp = DTRACE_METAPROVNONE;
9403
9404         /*
9405          * We strictly don't need the name, but we hold onto it for
9406          * debuggability. All hail error queues!
9407          */
9408         if (name == NULL) {
9409                 cmn_err(CE_WARN, "failed to register meta-provider: "
9410                     "invalid name");
9411                 return (EINVAL);
9412         }
9413
9414         if (mops == NULL ||
9415             mops->dtms_create_probe == NULL ||
9416             mops->dtms_provide_pid == NULL ||
9417             mops->dtms_remove_pid == NULL) {
9418                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9419                     "invalid ops", name);
9420                 return (EINVAL);
9421         }
9422
9423         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9424         meta->dtm_mops = *mops;
9425         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9426         (void) strcpy(meta->dtm_name, name);
9427         meta->dtm_arg = arg;
9428
9429         mutex_enter(&dtrace_meta_lock);
9430         mutex_enter(&dtrace_lock);
9431
9432         if (dtrace_meta_pid != NULL) {
9433                 mutex_exit(&dtrace_lock);
9434                 mutex_exit(&dtrace_meta_lock);
9435                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9436                     "user-land meta-provider exists", name);
9437                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9438                 kmem_free(meta, sizeof (dtrace_meta_t));
9439                 return (EINVAL);
9440         }
9441
9442         dtrace_meta_pid = meta;
9443         *idp = (dtrace_meta_provider_id_t)meta;
9444
9445         /*
9446          * If there are providers and probes ready to go, pass them
9447          * off to the new meta provider now.
9448          */
9449
9450         help = dtrace_deferred_pid;
9451         dtrace_deferred_pid = NULL;
9452
9453         mutex_exit(&dtrace_lock);
9454
9455         while (help != NULL) {
9456                 for (i = 0; i < help->dthps_nprovs; i++) {
9457                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9458                             help->dthps_pid);
9459                 }
9460
9461                 next = help->dthps_next;
9462                 help->dthps_next = NULL;
9463                 help->dthps_prev = NULL;
9464                 help->dthps_deferred = 0;
9465                 help = next;
9466         }
9467
9468         mutex_exit(&dtrace_meta_lock);
9469
9470         return (0);
9471 }
9472
9473 int
9474 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9475 {
9476         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9477
9478         mutex_enter(&dtrace_meta_lock);
9479         mutex_enter(&dtrace_lock);
9480
9481         if (old == dtrace_meta_pid) {
9482                 pp = &dtrace_meta_pid;
9483         } else {
9484                 panic("attempt to unregister non-existent "
9485                     "dtrace meta-provider %p\n", (void *)old);
9486         }
9487
9488         if (old->dtm_count != 0) {
9489                 mutex_exit(&dtrace_lock);
9490                 mutex_exit(&dtrace_meta_lock);
9491                 return (EBUSY);
9492         }
9493
9494         *pp = NULL;
9495
9496         mutex_exit(&dtrace_lock);
9497         mutex_exit(&dtrace_meta_lock);
9498
9499         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9500         kmem_free(old, sizeof (dtrace_meta_t));
9501
9502         return (0);
9503 }
9504
9505
9506 /*
9507  * DTrace DIF Object Functions
9508  */
9509 static int
9510 dtrace_difo_err(uint_t pc, const char *format, ...)
9511 {
9512         if (dtrace_err_verbose) {
9513                 va_list alist;
9514
9515                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9516                 va_start(alist, format);
9517                 (void) vuprintf(format, alist);
9518                 va_end(alist);
9519         }
9520
9521 #ifdef DTRACE_ERRDEBUG
9522         dtrace_errdebug(format);
9523 #endif
9524         return (1);
9525 }
9526
9527 /*
9528  * Validate a DTrace DIF object by checking the IR instructions.  The following
9529  * rules are currently enforced by dtrace_difo_validate():
9530  *
9531  * 1. Each instruction must have a valid opcode
9532  * 2. Each register, string, variable, or subroutine reference must be valid
9533  * 3. No instruction can modify register %r0 (must be zero)
9534  * 4. All instruction reserved bits must be set to zero
9535  * 5. The last instruction must be a "ret" instruction
9536  * 6. All branch targets must reference a valid instruction _after_ the branch
9537  */
9538 static int
9539 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9540     cred_t *cr)
9541 {
9542         int err = 0, i;
9543         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9544         int kcheckload;
9545         uint_t pc;
9546
9547         kcheckload = cr == NULL ||
9548             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9549
9550         dp->dtdo_destructive = 0;
9551
9552         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9553                 dif_instr_t instr = dp->dtdo_buf[pc];
9554
9555                 uint_t r1 = DIF_INSTR_R1(instr);
9556                 uint_t r2 = DIF_INSTR_R2(instr);
9557                 uint_t rd = DIF_INSTR_RD(instr);
9558                 uint_t rs = DIF_INSTR_RS(instr);
9559                 uint_t label = DIF_INSTR_LABEL(instr);
9560                 uint_t v = DIF_INSTR_VAR(instr);
9561                 uint_t subr = DIF_INSTR_SUBR(instr);
9562                 uint_t type = DIF_INSTR_TYPE(instr);
9563                 uint_t op = DIF_INSTR_OP(instr);
9564
9565                 switch (op) {
9566                 case DIF_OP_OR:
9567                 case DIF_OP_XOR:
9568                 case DIF_OP_AND:
9569                 case DIF_OP_SLL:
9570                 case DIF_OP_SRL:
9571                 case DIF_OP_SRA:
9572                 case DIF_OP_SUB:
9573                 case DIF_OP_ADD:
9574                 case DIF_OP_MUL:
9575                 case DIF_OP_SDIV:
9576                 case DIF_OP_UDIV:
9577                 case DIF_OP_SREM:
9578                 case DIF_OP_UREM:
9579                 case DIF_OP_COPYS:
9580                         if (r1 >= nregs)
9581                                 err += efunc(pc, "invalid register %u\n", r1);
9582                         if (r2 >= nregs)
9583                                 err += efunc(pc, "invalid register %u\n", r2);
9584                         if (rd >= nregs)
9585                                 err += efunc(pc, "invalid register %u\n", rd);
9586                         if (rd == 0)
9587                                 err += efunc(pc, "cannot write to %r0\n");
9588                         break;
9589                 case DIF_OP_NOT:
9590                 case DIF_OP_MOV:
9591                 case DIF_OP_ALLOCS:
9592                         if (r1 >= nregs)
9593                                 err += efunc(pc, "invalid register %u\n", r1);
9594                         if (r2 != 0)
9595                                 err += efunc(pc, "non-zero reserved bits\n");
9596                         if (rd >= nregs)
9597                                 err += efunc(pc, "invalid register %u\n", rd);
9598                         if (rd == 0)
9599                                 err += efunc(pc, "cannot write to %r0\n");
9600                         break;
9601                 case DIF_OP_LDSB:
9602                 case DIF_OP_LDSH:
9603                 case DIF_OP_LDSW:
9604                 case DIF_OP_LDUB:
9605                 case DIF_OP_LDUH:
9606                 case DIF_OP_LDUW:
9607                 case DIF_OP_LDX:
9608                         if (r1 >= nregs)
9609                                 err += efunc(pc, "invalid register %u\n", r1);
9610                         if (r2 != 0)
9611                                 err += efunc(pc, "non-zero reserved bits\n");
9612                         if (rd >= nregs)
9613                                 err += efunc(pc, "invalid register %u\n", rd);
9614                         if (rd == 0)
9615                                 err += efunc(pc, "cannot write to %r0\n");
9616                         if (kcheckload)
9617                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9618                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9619                         break;
9620                 case DIF_OP_RLDSB:
9621                 case DIF_OP_RLDSH:
9622                 case DIF_OP_RLDSW:
9623                 case DIF_OP_RLDUB:
9624                 case DIF_OP_RLDUH:
9625                 case DIF_OP_RLDUW:
9626                 case DIF_OP_RLDX:
9627                         if (r1 >= nregs)
9628                                 err += efunc(pc, "invalid register %u\n", r1);
9629                         if (r2 != 0)
9630                                 err += efunc(pc, "non-zero reserved bits\n");
9631                         if (rd >= nregs)
9632                                 err += efunc(pc, "invalid register %u\n", rd);
9633                         if (rd == 0)
9634                                 err += efunc(pc, "cannot write to %r0\n");
9635                         break;
9636                 case DIF_OP_ULDSB:
9637                 case DIF_OP_ULDSH:
9638                 case DIF_OP_ULDSW:
9639                 case DIF_OP_ULDUB:
9640                 case DIF_OP_ULDUH:
9641                 case DIF_OP_ULDUW:
9642                 case DIF_OP_ULDX:
9643                         if (r1 >= nregs)
9644                                 err += efunc(pc, "invalid register %u\n", r1);
9645                         if (r2 != 0)
9646                                 err += efunc(pc, "non-zero reserved bits\n");
9647                         if (rd >= nregs)
9648                                 err += efunc(pc, "invalid register %u\n", rd);
9649                         if (rd == 0)
9650                                 err += efunc(pc, "cannot write to %r0\n");
9651                         break;
9652                 case DIF_OP_STB:
9653                 case DIF_OP_STH:
9654                 case DIF_OP_STW:
9655                 case DIF_OP_STX:
9656                         if (r1 >= nregs)
9657                                 err += efunc(pc, "invalid register %u\n", r1);
9658                         if (r2 != 0)
9659                                 err += efunc(pc, "non-zero reserved bits\n");
9660                         if (rd >= nregs)
9661                                 err += efunc(pc, "invalid register %u\n", rd);
9662                         if (rd == 0)
9663                                 err += efunc(pc, "cannot write to 0 address\n");
9664                         break;
9665                 case DIF_OP_CMP:
9666                 case DIF_OP_SCMP:
9667                         if (r1 >= nregs)
9668                                 err += efunc(pc, "invalid register %u\n", r1);
9669                         if (r2 >= nregs)
9670                                 err += efunc(pc, "invalid register %u\n", r2);
9671                         if (rd != 0)
9672                                 err += efunc(pc, "non-zero reserved bits\n");
9673                         break;
9674                 case DIF_OP_TST:
9675                         if (r1 >= nregs)
9676                                 err += efunc(pc, "invalid register %u\n", r1);
9677                         if (r2 != 0 || rd != 0)
9678                                 err += efunc(pc, "non-zero reserved bits\n");
9679                         break;
9680                 case DIF_OP_BA:
9681                 case DIF_OP_BE:
9682                 case DIF_OP_BNE:
9683                 case DIF_OP_BG:
9684                 case DIF_OP_BGU:
9685                 case DIF_OP_BGE:
9686                 case DIF_OP_BGEU:
9687                 case DIF_OP_BL:
9688                 case DIF_OP_BLU:
9689                 case DIF_OP_BLE:
9690                 case DIF_OP_BLEU:
9691                         if (label >= dp->dtdo_len) {
9692                                 err += efunc(pc, "invalid branch target %u\n",
9693                                     label);
9694                         }
9695                         if (label <= pc) {
9696                                 err += efunc(pc, "backward branch to %u\n",
9697                                     label);
9698                         }
9699                         break;
9700                 case DIF_OP_RET:
9701                         if (r1 != 0 || r2 != 0)
9702                                 err += efunc(pc, "non-zero reserved bits\n");
9703                         if (rd >= nregs)
9704                                 err += efunc(pc, "invalid register %u\n", rd);
9705                         break;
9706                 case DIF_OP_NOP:
9707                 case DIF_OP_POPTS:
9708                 case DIF_OP_FLUSHTS:
9709                         if (r1 != 0 || r2 != 0 || rd != 0)
9710                                 err += efunc(pc, "non-zero reserved bits\n");
9711                         break;
9712                 case DIF_OP_SETX:
9713                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9714                                 err += efunc(pc, "invalid integer ref %u\n",
9715                                     DIF_INSTR_INTEGER(instr));
9716                         }
9717                         if (rd >= nregs)
9718                                 err += efunc(pc, "invalid register %u\n", rd);
9719                         if (rd == 0)
9720                                 err += efunc(pc, "cannot write to %r0\n");
9721                         break;
9722                 case DIF_OP_SETS:
9723                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9724                                 err += efunc(pc, "invalid string ref %u\n",
9725                                     DIF_INSTR_STRING(instr));
9726                         }
9727                         if (rd >= nregs)
9728                                 err += efunc(pc, "invalid register %u\n", rd);
9729                         if (rd == 0)
9730                                 err += efunc(pc, "cannot write to %r0\n");
9731                         break;
9732                 case DIF_OP_LDGA:
9733                 case DIF_OP_LDTA:
9734                         if (r1 > DIF_VAR_ARRAY_MAX)
9735                                 err += efunc(pc, "invalid array %u\n", r1);
9736                         if (r2 >= nregs)
9737                                 err += efunc(pc, "invalid register %u\n", r2);
9738                         if (rd >= nregs)
9739                                 err += efunc(pc, "invalid register %u\n", rd);
9740                         if (rd == 0)
9741                                 err += efunc(pc, "cannot write to %r0\n");
9742                         break;
9743                 case DIF_OP_LDGS:
9744                 case DIF_OP_LDTS:
9745                 case DIF_OP_LDLS:
9746                 case DIF_OP_LDGAA:
9747                 case DIF_OP_LDTAA:
9748                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9749                                 err += efunc(pc, "invalid variable %u\n", v);
9750                         if (rd >= nregs)
9751                                 err += efunc(pc, "invalid register %u\n", rd);
9752                         if (rd == 0)
9753                                 err += efunc(pc, "cannot write to %r0\n");
9754                         break;
9755                 case DIF_OP_STGS:
9756                 case DIF_OP_STTS:
9757                 case DIF_OP_STLS:
9758                 case DIF_OP_STGAA:
9759                 case DIF_OP_STTAA:
9760                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9761                                 err += efunc(pc, "invalid variable %u\n", v);
9762                         if (rs >= nregs)
9763                                 err += efunc(pc, "invalid register %u\n", rd);
9764                         break;
9765                 case DIF_OP_CALL:
9766                         if (subr > DIF_SUBR_MAX)
9767                                 err += efunc(pc, "invalid subr %u\n", subr);
9768                         if (rd >= nregs)
9769                                 err += efunc(pc, "invalid register %u\n", rd);
9770                         if (rd == 0)
9771                                 err += efunc(pc, "cannot write to %r0\n");
9772
9773                         if (subr == DIF_SUBR_COPYOUT ||
9774                             subr == DIF_SUBR_COPYOUTSTR) {
9775                                 dp->dtdo_destructive = 1;
9776                         }
9777
9778                         if (subr == DIF_SUBR_GETF) {
9779                                 /*
9780                                  * If we have a getf() we need to record that
9781                                  * in our state.  Note that our state can be
9782                                  * NULL if this is a helper -- but in that
9783                                  * case, the call to getf() is itself illegal,
9784                                  * and will be caught (slightly later) when
9785                                  * the helper is validated.
9786                                  */
9787                                 if (vstate->dtvs_state != NULL)
9788                                         vstate->dtvs_state->dts_getf++;
9789                         }
9790
9791                         break;
9792                 case DIF_OP_PUSHTR:
9793                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9794                                 err += efunc(pc, "invalid ref type %u\n", type);
9795                         if (r2 >= nregs)
9796                                 err += efunc(pc, "invalid register %u\n", r2);
9797                         if (rs >= nregs)
9798                                 err += efunc(pc, "invalid register %u\n", rs);
9799                         break;
9800                 case DIF_OP_PUSHTV:
9801                         if (type != DIF_TYPE_CTF)
9802                                 err += efunc(pc, "invalid val type %u\n", type);
9803                         if (r2 >= nregs)
9804                                 err += efunc(pc, "invalid register %u\n", r2);
9805                         if (rs >= nregs)
9806                                 err += efunc(pc, "invalid register %u\n", rs);
9807                         break;
9808                 default:
9809                         err += efunc(pc, "invalid opcode %u\n",
9810                             DIF_INSTR_OP(instr));
9811                 }
9812         }
9813
9814         if (dp->dtdo_len != 0 &&
9815             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9816                 err += efunc(dp->dtdo_len - 1,
9817                     "expected 'ret' as last DIF instruction\n");
9818         }
9819
9820         if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9821                 /*
9822                  * If we're not returning by reference, the size must be either
9823                  * 0 or the size of one of the base types.
9824                  */
9825                 switch (dp->dtdo_rtype.dtdt_size) {
9826                 case 0:
9827                 case sizeof (uint8_t):
9828                 case sizeof (uint16_t):
9829                 case sizeof (uint32_t):
9830                 case sizeof (uint64_t):
9831                         break;
9832
9833                 default:
9834                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
9835                 }
9836         }
9837
9838         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9839                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9840                 dtrace_diftype_t *vt, *et;
9841                 uint_t id, ndx;
9842
9843                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9844                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
9845                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9846                         err += efunc(i, "unrecognized variable scope %d\n",
9847                             v->dtdv_scope);
9848                         break;
9849                 }
9850
9851                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9852                     v->dtdv_kind != DIFV_KIND_SCALAR) {
9853                         err += efunc(i, "unrecognized variable type %d\n",
9854                             v->dtdv_kind);
9855                         break;
9856                 }
9857
9858                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9859                         err += efunc(i, "%d exceeds variable id limit\n", id);
9860                         break;
9861                 }
9862
9863                 if (id < DIF_VAR_OTHER_UBASE)
9864                         continue;
9865
9866                 /*
9867                  * For user-defined variables, we need to check that this
9868                  * definition is identical to any previous definition that we
9869                  * encountered.
9870                  */
9871                 ndx = id - DIF_VAR_OTHER_UBASE;
9872
9873                 switch (v->dtdv_scope) {
9874                 case DIFV_SCOPE_GLOBAL:
9875                         if (ndx < vstate->dtvs_nglobals) {
9876                                 dtrace_statvar_t *svar;
9877
9878                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9879                                         existing = &svar->dtsv_var;
9880                         }
9881
9882                         break;
9883
9884                 case DIFV_SCOPE_THREAD:
9885                         if (ndx < vstate->dtvs_ntlocals)
9886                                 existing = &vstate->dtvs_tlocals[ndx];
9887                         break;
9888
9889                 case DIFV_SCOPE_LOCAL:
9890                         if (ndx < vstate->dtvs_nlocals) {
9891                                 dtrace_statvar_t *svar;
9892
9893                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9894                                         existing = &svar->dtsv_var;
9895                         }
9896
9897                         break;
9898                 }
9899
9900                 vt = &v->dtdv_type;
9901
9902                 if (vt->dtdt_flags & DIF_TF_BYREF) {
9903                         if (vt->dtdt_size == 0) {
9904                                 err += efunc(i, "zero-sized variable\n");
9905                                 break;
9906                         }
9907
9908                         if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9909                             vt->dtdt_size > dtrace_global_maxsize) {
9910                                 err += efunc(i, "oversized by-ref global\n");
9911                                 break;
9912                         }
9913                 }
9914
9915                 if (existing == NULL || existing->dtdv_id == 0)
9916                         continue;
9917
9918                 ASSERT(existing->dtdv_id == v->dtdv_id);
9919                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9920
9921                 if (existing->dtdv_kind != v->dtdv_kind)
9922                         err += efunc(i, "%d changed variable kind\n", id);
9923
9924                 et = &existing->dtdv_type;
9925
9926                 if (vt->dtdt_flags != et->dtdt_flags) {
9927                         err += efunc(i, "%d changed variable type flags\n", id);
9928                         break;
9929                 }
9930
9931                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9932                         err += efunc(i, "%d changed variable type size\n", id);
9933                         break;
9934                 }
9935         }
9936
9937         return (err);
9938 }
9939
9940 /*
9941  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9942  * are much more constrained than normal DIFOs.  Specifically, they may
9943  * not:
9944  *
9945  * 1. Make calls to subroutines other than copyin(), copyinstr() or
9946  *    miscellaneous string routines
9947  * 2. Access DTrace variables other than the args[] array, and the
9948  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9949  * 3. Have thread-local variables.
9950  * 4. Have dynamic variables.
9951  */
9952 static int
9953 dtrace_difo_validate_helper(dtrace_difo_t *dp)
9954 {
9955         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9956         int err = 0;
9957         uint_t pc;
9958
9959         for (pc = 0; pc < dp->dtdo_len; pc++) {
9960                 dif_instr_t instr = dp->dtdo_buf[pc];
9961
9962                 uint_t v = DIF_INSTR_VAR(instr);
9963                 uint_t subr = DIF_INSTR_SUBR(instr);
9964                 uint_t op = DIF_INSTR_OP(instr);
9965
9966                 switch (op) {
9967                 case DIF_OP_OR:
9968                 case DIF_OP_XOR:
9969                 case DIF_OP_AND:
9970                 case DIF_OP_SLL:
9971                 case DIF_OP_SRL:
9972                 case DIF_OP_SRA:
9973                 case DIF_OP_SUB:
9974                 case DIF_OP_ADD:
9975                 case DIF_OP_MUL:
9976                 case DIF_OP_SDIV:
9977                 case DIF_OP_UDIV:
9978                 case DIF_OP_SREM:
9979                 case DIF_OP_UREM:
9980                 case DIF_OP_COPYS:
9981                 case DIF_OP_NOT:
9982                 case DIF_OP_MOV:
9983                 case DIF_OP_RLDSB:
9984                 case DIF_OP_RLDSH:
9985                 case DIF_OP_RLDSW:
9986                 case DIF_OP_RLDUB:
9987                 case DIF_OP_RLDUH:
9988                 case DIF_OP_RLDUW:
9989                 case DIF_OP_RLDX:
9990                 case DIF_OP_ULDSB:
9991                 case DIF_OP_ULDSH:
9992                 case DIF_OP_ULDSW:
9993                 case DIF_OP_ULDUB:
9994                 case DIF_OP_ULDUH:
9995                 case DIF_OP_ULDUW:
9996                 case DIF_OP_ULDX:
9997                 case DIF_OP_STB:
9998                 case DIF_OP_STH:
9999                 case DIF_OP_STW:
10000                 case DIF_OP_STX:
10001                 case DIF_OP_ALLOCS:
10002                 case DIF_OP_CMP:
10003                 case DIF_OP_SCMP:
10004                 case DIF_OP_TST:
10005                 case DIF_OP_BA:
10006                 case DIF_OP_BE:
10007                 case DIF_OP_BNE:
10008                 case DIF_OP_BG:
10009                 case DIF_OP_BGU:
10010                 case DIF_OP_BGE:
10011                 case DIF_OP_BGEU:
10012                 case DIF_OP_BL:
10013                 case DIF_OP_BLU:
10014                 case DIF_OP_BLE:
10015                 case DIF_OP_BLEU:
10016                 case DIF_OP_RET:
10017                 case DIF_OP_NOP:
10018                 case DIF_OP_POPTS:
10019                 case DIF_OP_FLUSHTS:
10020                 case DIF_OP_SETX:
10021                 case DIF_OP_SETS:
10022                 case DIF_OP_LDGA:
10023                 case DIF_OP_LDLS:
10024                 case DIF_OP_STGS:
10025                 case DIF_OP_STLS:
10026                 case DIF_OP_PUSHTR:
10027                 case DIF_OP_PUSHTV:
10028                         break;
10029
10030                 case DIF_OP_LDGS:
10031                         if (v >= DIF_VAR_OTHER_UBASE)
10032                                 break;
10033
10034                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10035                                 break;
10036
10037                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10038                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10039                             v == DIF_VAR_EXECARGS ||
10040                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10041                             v == DIF_VAR_UID || v == DIF_VAR_GID)
10042                                 break;
10043
10044                         err += efunc(pc, "illegal variable %u\n", v);
10045                         break;
10046
10047                 case DIF_OP_LDTA:
10048                 case DIF_OP_LDTS:
10049                 case DIF_OP_LDGAA:
10050                 case DIF_OP_LDTAA:
10051                         err += efunc(pc, "illegal dynamic variable load\n");
10052                         break;
10053
10054                 case DIF_OP_STTS:
10055                 case DIF_OP_STGAA:
10056                 case DIF_OP_STTAA:
10057                         err += efunc(pc, "illegal dynamic variable store\n");
10058                         break;
10059
10060                 case DIF_OP_CALL:
10061                         if (subr == DIF_SUBR_ALLOCA ||
10062                             subr == DIF_SUBR_BCOPY ||
10063                             subr == DIF_SUBR_COPYIN ||
10064                             subr == DIF_SUBR_COPYINTO ||
10065                             subr == DIF_SUBR_COPYINSTR ||
10066                             subr == DIF_SUBR_INDEX ||
10067                             subr == DIF_SUBR_INET_NTOA ||
10068                             subr == DIF_SUBR_INET_NTOA6 ||
10069                             subr == DIF_SUBR_INET_NTOP ||
10070                             subr == DIF_SUBR_JSON ||
10071                             subr == DIF_SUBR_LLTOSTR ||
10072                             subr == DIF_SUBR_STRTOLL ||
10073                             subr == DIF_SUBR_RINDEX ||
10074                             subr == DIF_SUBR_STRCHR ||
10075                             subr == DIF_SUBR_STRJOIN ||
10076                             subr == DIF_SUBR_STRRCHR ||
10077                             subr == DIF_SUBR_STRSTR ||
10078                             subr == DIF_SUBR_HTONS ||
10079                             subr == DIF_SUBR_HTONL ||
10080                             subr == DIF_SUBR_HTONLL ||
10081                             subr == DIF_SUBR_NTOHS ||
10082                             subr == DIF_SUBR_NTOHL ||
10083                             subr == DIF_SUBR_NTOHLL ||
10084                             subr == DIF_SUBR_MEMREF ||
10085 #if !defined(sun)
10086                             subr == DIF_SUBR_MEMSTR ||
10087 #endif
10088                             subr == DIF_SUBR_TYPEREF)
10089                                 break;
10090
10091                         err += efunc(pc, "invalid subr %u\n", subr);
10092                         break;
10093
10094                 default:
10095                         err += efunc(pc, "invalid opcode %u\n",
10096                             DIF_INSTR_OP(instr));
10097                 }
10098         }
10099
10100         return (err);
10101 }
10102
10103 /*
10104  * Returns 1 if the expression in the DIF object can be cached on a per-thread
10105  * basis; 0 if not.
10106  */
10107 static int
10108 dtrace_difo_cacheable(dtrace_difo_t *dp)
10109 {
10110         int i;
10111
10112         if (dp == NULL)
10113                 return (0);
10114
10115         for (i = 0; i < dp->dtdo_varlen; i++) {
10116                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10117
10118                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10119                         continue;
10120
10121                 switch (v->dtdv_id) {
10122                 case DIF_VAR_CURTHREAD:
10123                 case DIF_VAR_PID:
10124                 case DIF_VAR_TID:
10125                 case DIF_VAR_EXECARGS:
10126                 case DIF_VAR_EXECNAME:
10127                 case DIF_VAR_ZONENAME:
10128                         break;
10129
10130                 default:
10131                         return (0);
10132                 }
10133         }
10134
10135         /*
10136          * This DIF object may be cacheable.  Now we need to look for any
10137          * array loading instructions, any memory loading instructions, or
10138          * any stores to thread-local variables.
10139          */
10140         for (i = 0; i < dp->dtdo_len; i++) {
10141                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10142
10143                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10144                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10145                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10146                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
10147                         return (0);
10148         }
10149
10150         return (1);
10151 }
10152
10153 static void
10154 dtrace_difo_hold(dtrace_difo_t *dp)
10155 {
10156         int i;
10157
10158         ASSERT(MUTEX_HELD(&dtrace_lock));
10159
10160         dp->dtdo_refcnt++;
10161         ASSERT(dp->dtdo_refcnt != 0);
10162
10163         /*
10164          * We need to check this DIF object for references to the variable
10165          * DIF_VAR_VTIMESTAMP.
10166          */
10167         for (i = 0; i < dp->dtdo_varlen; i++) {
10168                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10169
10170                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10171                         continue;
10172
10173                 if (dtrace_vtime_references++ == 0)
10174                         dtrace_vtime_enable();
10175         }
10176 }
10177
10178 /*
10179  * This routine calculates the dynamic variable chunksize for a given DIF
10180  * object.  The calculation is not fool-proof, and can probably be tricked by
10181  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10182  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10183  * if a dynamic variable size exceeds the chunksize.
10184  */
10185 static void
10186 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10187 {
10188         uint64_t sval = 0;
10189         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10190         const dif_instr_t *text = dp->dtdo_buf;
10191         uint_t pc, srd = 0;
10192         uint_t ttop = 0;
10193         size_t size, ksize;
10194         uint_t id, i;
10195
10196         for (pc = 0; pc < dp->dtdo_len; pc++) {
10197                 dif_instr_t instr = text[pc];
10198                 uint_t op = DIF_INSTR_OP(instr);
10199                 uint_t rd = DIF_INSTR_RD(instr);
10200                 uint_t r1 = DIF_INSTR_R1(instr);
10201                 uint_t nkeys = 0;
10202                 uchar_t scope = 0;
10203
10204                 dtrace_key_t *key = tupregs;
10205
10206                 switch (op) {
10207                 case DIF_OP_SETX:
10208                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10209                         srd = rd;
10210                         continue;
10211
10212                 case DIF_OP_STTS:
10213                         key = &tupregs[DIF_DTR_NREGS];
10214                         key[0].dttk_size = 0;
10215                         key[1].dttk_size = 0;
10216                         nkeys = 2;
10217                         scope = DIFV_SCOPE_THREAD;
10218                         break;
10219
10220                 case DIF_OP_STGAA:
10221                 case DIF_OP_STTAA:
10222                         nkeys = ttop;
10223
10224                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10225                                 key[nkeys++].dttk_size = 0;
10226
10227                         key[nkeys++].dttk_size = 0;
10228
10229                         if (op == DIF_OP_STTAA) {
10230                                 scope = DIFV_SCOPE_THREAD;
10231                         } else {
10232                                 scope = DIFV_SCOPE_GLOBAL;
10233                         }
10234
10235                         break;
10236
10237                 case DIF_OP_PUSHTR:
10238                         if (ttop == DIF_DTR_NREGS)
10239                                 return;
10240
10241                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10242                                 /*
10243                                  * If the register for the size of the "pushtr"
10244                                  * is %r0 (or the value is 0) and the type is
10245                                  * a string, we'll use the system-wide default
10246                                  * string size.
10247                                  */
10248                                 tupregs[ttop++].dttk_size =
10249                                     dtrace_strsize_default;
10250                         } else {
10251                                 if (srd == 0)
10252                                         return;
10253
10254                                 tupregs[ttop++].dttk_size = sval;
10255                         }
10256
10257                         break;
10258
10259                 case DIF_OP_PUSHTV:
10260                         if (ttop == DIF_DTR_NREGS)
10261                                 return;
10262
10263                         tupregs[ttop++].dttk_size = 0;
10264                         break;
10265
10266                 case DIF_OP_FLUSHTS:
10267                         ttop = 0;
10268                         break;
10269
10270                 case DIF_OP_POPTS:
10271                         if (ttop != 0)
10272                                 ttop--;
10273                         break;
10274                 }
10275
10276                 sval = 0;
10277                 srd = 0;
10278
10279                 if (nkeys == 0)
10280                         continue;
10281
10282                 /*
10283                  * We have a dynamic variable allocation; calculate its size.
10284                  */
10285                 for (ksize = 0, i = 0; i < nkeys; i++)
10286                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10287
10288                 size = sizeof (dtrace_dynvar_t);
10289                 size += sizeof (dtrace_key_t) * (nkeys - 1);
10290                 size += ksize;
10291
10292                 /*
10293                  * Now we need to determine the size of the stored data.
10294                  */
10295                 id = DIF_INSTR_VAR(instr);
10296
10297                 for (i = 0; i < dp->dtdo_varlen; i++) {
10298                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
10299
10300                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
10301                                 size += v->dtdv_type.dtdt_size;
10302                                 break;
10303                         }
10304                 }
10305
10306                 if (i == dp->dtdo_varlen)
10307                         return;
10308
10309                 /*
10310                  * We have the size.  If this is larger than the chunk size
10311                  * for our dynamic variable state, reset the chunk size.
10312                  */
10313                 size = P2ROUNDUP(size, sizeof (uint64_t));
10314
10315                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10316                         vstate->dtvs_dynvars.dtds_chunksize = size;
10317         }
10318 }
10319
10320 static void
10321 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10322 {
10323         int i, oldsvars, osz, nsz, otlocals, ntlocals;
10324         uint_t id;
10325
10326         ASSERT(MUTEX_HELD(&dtrace_lock));
10327         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10328
10329         for (i = 0; i < dp->dtdo_varlen; i++) {
10330                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10331                 dtrace_statvar_t *svar, ***svarp = NULL;
10332                 size_t dsize = 0;
10333                 uint8_t scope = v->dtdv_scope;
10334                 int *np = NULL;
10335
10336                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10337                         continue;
10338
10339                 id -= DIF_VAR_OTHER_UBASE;
10340
10341                 switch (scope) {
10342                 case DIFV_SCOPE_THREAD:
10343                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10344                                 dtrace_difv_t *tlocals;
10345
10346                                 if ((ntlocals = (otlocals << 1)) == 0)
10347                                         ntlocals = 1;
10348
10349                                 osz = otlocals * sizeof (dtrace_difv_t);
10350                                 nsz = ntlocals * sizeof (dtrace_difv_t);
10351
10352                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10353
10354                                 if (osz != 0) {
10355                                         bcopy(vstate->dtvs_tlocals,
10356                                             tlocals, osz);
10357                                         kmem_free(vstate->dtvs_tlocals, osz);
10358                                 }
10359
10360                                 vstate->dtvs_tlocals = tlocals;
10361                                 vstate->dtvs_ntlocals = ntlocals;
10362                         }
10363
10364                         vstate->dtvs_tlocals[id] = *v;
10365                         continue;
10366
10367                 case DIFV_SCOPE_LOCAL:
10368                         np = &vstate->dtvs_nlocals;
10369                         svarp = &vstate->dtvs_locals;
10370
10371                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10372                                 dsize = NCPU * (v->dtdv_type.dtdt_size +
10373                                     sizeof (uint64_t));
10374                         else
10375                                 dsize = NCPU * sizeof (uint64_t);
10376
10377                         break;
10378
10379                 case DIFV_SCOPE_GLOBAL:
10380                         np = &vstate->dtvs_nglobals;
10381                         svarp = &vstate->dtvs_globals;
10382
10383                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10384                                 dsize = v->dtdv_type.dtdt_size +
10385                                     sizeof (uint64_t);
10386
10387                         break;
10388
10389                 default:
10390                         ASSERT(0);
10391                 }
10392
10393                 while (id >= (oldsvars = *np)) {
10394                         dtrace_statvar_t **statics;
10395                         int newsvars, oldsize, newsize;
10396
10397                         if ((newsvars = (oldsvars << 1)) == 0)
10398                                 newsvars = 1;
10399
10400                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10401                         newsize = newsvars * sizeof (dtrace_statvar_t *);
10402
10403                         statics = kmem_zalloc(newsize, KM_SLEEP);
10404
10405                         if (oldsize != 0) {
10406                                 bcopy(*svarp, statics, oldsize);
10407                                 kmem_free(*svarp, oldsize);
10408                         }
10409
10410                         *svarp = statics;
10411                         *np = newsvars;
10412                 }
10413
10414                 if ((svar = (*svarp)[id]) == NULL) {
10415                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10416                         svar->dtsv_var = *v;
10417
10418                         if ((svar->dtsv_size = dsize) != 0) {
10419                                 svar->dtsv_data = (uint64_t)(uintptr_t)
10420                                     kmem_zalloc(dsize, KM_SLEEP);
10421                         }
10422
10423                         (*svarp)[id] = svar;
10424                 }
10425
10426                 svar->dtsv_refcnt++;
10427         }
10428
10429         dtrace_difo_chunksize(dp, vstate);
10430         dtrace_difo_hold(dp);
10431 }
10432
10433 static dtrace_difo_t *
10434 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10435 {
10436         dtrace_difo_t *new;
10437         size_t sz;
10438
10439         ASSERT(dp->dtdo_buf != NULL);
10440         ASSERT(dp->dtdo_refcnt != 0);
10441
10442         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10443
10444         ASSERT(dp->dtdo_buf != NULL);
10445         sz = dp->dtdo_len * sizeof (dif_instr_t);
10446         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10447         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10448         new->dtdo_len = dp->dtdo_len;
10449
10450         if (dp->dtdo_strtab != NULL) {
10451                 ASSERT(dp->dtdo_strlen != 0);
10452                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10453                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10454                 new->dtdo_strlen = dp->dtdo_strlen;
10455         }
10456
10457         if (dp->dtdo_inttab != NULL) {
10458                 ASSERT(dp->dtdo_intlen != 0);
10459                 sz = dp->dtdo_intlen * sizeof (uint64_t);
10460                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10461                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10462                 new->dtdo_intlen = dp->dtdo_intlen;
10463         }
10464
10465         if (dp->dtdo_vartab != NULL) {
10466                 ASSERT(dp->dtdo_varlen != 0);
10467                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10468                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10469                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10470                 new->dtdo_varlen = dp->dtdo_varlen;
10471         }
10472
10473         dtrace_difo_init(new, vstate);
10474         return (new);
10475 }
10476
10477 static void
10478 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10479 {
10480         int i;
10481
10482         ASSERT(dp->dtdo_refcnt == 0);
10483
10484         for (i = 0; i < dp->dtdo_varlen; i++) {
10485                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10486                 dtrace_statvar_t *svar, **svarp = NULL;
10487                 uint_t id;
10488                 uint8_t scope = v->dtdv_scope;
10489                 int *np = NULL;
10490
10491                 switch (scope) {
10492                 case DIFV_SCOPE_THREAD:
10493                         continue;
10494
10495                 case DIFV_SCOPE_LOCAL:
10496                         np = &vstate->dtvs_nlocals;
10497                         svarp = vstate->dtvs_locals;
10498                         break;
10499
10500                 case DIFV_SCOPE_GLOBAL:
10501                         np = &vstate->dtvs_nglobals;
10502                         svarp = vstate->dtvs_globals;
10503                         break;
10504
10505                 default:
10506                         ASSERT(0);
10507                 }
10508
10509                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10510                         continue;
10511
10512                 id -= DIF_VAR_OTHER_UBASE;
10513                 ASSERT(id < *np);
10514
10515                 svar = svarp[id];
10516                 ASSERT(svar != NULL);
10517                 ASSERT(svar->dtsv_refcnt > 0);
10518
10519                 if (--svar->dtsv_refcnt > 0)
10520                         continue;
10521
10522                 if (svar->dtsv_size != 0) {
10523                         ASSERT(svar->dtsv_data != 0);
10524                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
10525                             svar->dtsv_size);
10526                 }
10527
10528                 kmem_free(svar, sizeof (dtrace_statvar_t));
10529                 svarp[id] = NULL;
10530         }
10531
10532         if (dp->dtdo_buf != NULL)
10533                 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10534         if (dp->dtdo_inttab != NULL)
10535                 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10536         if (dp->dtdo_strtab != NULL)
10537                 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10538         if (dp->dtdo_vartab != NULL)
10539                 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10540
10541         kmem_free(dp, sizeof (dtrace_difo_t));
10542 }
10543
10544 static void
10545 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10546 {
10547         int i;
10548
10549         ASSERT(MUTEX_HELD(&dtrace_lock));
10550         ASSERT(dp->dtdo_refcnt != 0);
10551
10552         for (i = 0; i < dp->dtdo_varlen; i++) {
10553                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10554
10555                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10556                         continue;
10557
10558                 ASSERT(dtrace_vtime_references > 0);
10559                 if (--dtrace_vtime_references == 0)
10560                         dtrace_vtime_disable();
10561         }
10562
10563         if (--dp->dtdo_refcnt == 0)
10564                 dtrace_difo_destroy(dp, vstate);
10565 }
10566
10567 /*
10568  * DTrace Format Functions
10569  */
10570 static uint16_t
10571 dtrace_format_add(dtrace_state_t *state, char *str)
10572 {
10573         char *fmt, **new;
10574         uint16_t ndx, len = strlen(str) + 1;
10575
10576         fmt = kmem_zalloc(len, KM_SLEEP);
10577         bcopy(str, fmt, len);
10578
10579         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10580                 if (state->dts_formats[ndx] == NULL) {
10581                         state->dts_formats[ndx] = fmt;
10582                         return (ndx + 1);
10583                 }
10584         }
10585
10586         if (state->dts_nformats == USHRT_MAX) {
10587                 /*
10588                  * This is only likely if a denial-of-service attack is being
10589                  * attempted.  As such, it's okay to fail silently here.
10590                  */
10591                 kmem_free(fmt, len);
10592                 return (0);
10593         }
10594
10595         /*
10596          * For simplicity, we always resize the formats array to be exactly the
10597          * number of formats.
10598          */
10599         ndx = state->dts_nformats++;
10600         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10601
10602         if (state->dts_formats != NULL) {
10603                 ASSERT(ndx != 0);
10604                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10605                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10606         }
10607
10608         state->dts_formats = new;
10609         state->dts_formats[ndx] = fmt;
10610
10611         return (ndx + 1);
10612 }
10613
10614 static void
10615 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10616 {
10617         char *fmt;
10618
10619         ASSERT(state->dts_formats != NULL);
10620         ASSERT(format <= state->dts_nformats);
10621         ASSERT(state->dts_formats[format - 1] != NULL);
10622
10623         fmt = state->dts_formats[format - 1];
10624         kmem_free(fmt, strlen(fmt) + 1);
10625         state->dts_formats[format - 1] = NULL;
10626 }
10627
10628 static void
10629 dtrace_format_destroy(dtrace_state_t *state)
10630 {
10631         int i;
10632
10633         if (state->dts_nformats == 0) {
10634                 ASSERT(state->dts_formats == NULL);
10635                 return;
10636         }
10637
10638         ASSERT(state->dts_formats != NULL);
10639
10640         for (i = 0; i < state->dts_nformats; i++) {
10641                 char *fmt = state->dts_formats[i];
10642
10643                 if (fmt == NULL)
10644                         continue;
10645
10646                 kmem_free(fmt, strlen(fmt) + 1);
10647         }
10648
10649         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10650         state->dts_nformats = 0;
10651         state->dts_formats = NULL;
10652 }
10653
10654 /*
10655  * DTrace Predicate Functions
10656  */
10657 static dtrace_predicate_t *
10658 dtrace_predicate_create(dtrace_difo_t *dp)
10659 {
10660         dtrace_predicate_t *pred;
10661
10662         ASSERT(MUTEX_HELD(&dtrace_lock));
10663         ASSERT(dp->dtdo_refcnt != 0);
10664
10665         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10666         pred->dtp_difo = dp;
10667         pred->dtp_refcnt = 1;
10668
10669         if (!dtrace_difo_cacheable(dp))
10670                 return (pred);
10671
10672         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10673                 /*
10674                  * This is only theoretically possible -- we have had 2^32
10675                  * cacheable predicates on this machine.  We cannot allow any
10676                  * more predicates to become cacheable:  as unlikely as it is,
10677                  * there may be a thread caching a (now stale) predicate cache
10678                  * ID. (N.B.: the temptation is being successfully resisted to
10679                  * have this cmn_err() "Holy shit -- we executed this code!")
10680                  */
10681                 return (pred);
10682         }
10683
10684         pred->dtp_cacheid = dtrace_predcache_id++;
10685
10686         return (pred);
10687 }
10688
10689 static void
10690 dtrace_predicate_hold(dtrace_predicate_t *pred)
10691 {
10692         ASSERT(MUTEX_HELD(&dtrace_lock));
10693         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10694         ASSERT(pred->dtp_refcnt > 0);
10695
10696         pred->dtp_refcnt++;
10697 }
10698
10699 static void
10700 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10701 {
10702         dtrace_difo_t *dp = pred->dtp_difo;
10703
10704         ASSERT(MUTEX_HELD(&dtrace_lock));
10705         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10706         ASSERT(pred->dtp_refcnt > 0);
10707
10708         if (--pred->dtp_refcnt == 0) {
10709                 dtrace_difo_release(pred->dtp_difo, vstate);
10710                 kmem_free(pred, sizeof (dtrace_predicate_t));
10711         }
10712 }
10713
10714 /*
10715  * DTrace Action Description Functions
10716  */
10717 static dtrace_actdesc_t *
10718 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10719     uint64_t uarg, uint64_t arg)
10720 {
10721         dtrace_actdesc_t *act;
10722
10723 #if defined(sun)
10724         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10725             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10726 #endif
10727
10728         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10729         act->dtad_kind = kind;
10730         act->dtad_ntuple = ntuple;
10731         act->dtad_uarg = uarg;
10732         act->dtad_arg = arg;
10733         act->dtad_refcnt = 1;
10734
10735         return (act);
10736 }
10737
10738 static void
10739 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10740 {
10741         ASSERT(act->dtad_refcnt >= 1);
10742         act->dtad_refcnt++;
10743 }
10744
10745 static void
10746 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10747 {
10748         dtrace_actkind_t kind = act->dtad_kind;
10749         dtrace_difo_t *dp;
10750
10751         ASSERT(act->dtad_refcnt >= 1);
10752
10753         if (--act->dtad_refcnt != 0)
10754                 return;
10755
10756         if ((dp = act->dtad_difo) != NULL)
10757                 dtrace_difo_release(dp, vstate);
10758
10759         if (DTRACEACT_ISPRINTFLIKE(kind)) {
10760                 char *str = (char *)(uintptr_t)act->dtad_arg;
10761
10762 #if defined(sun)
10763                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10764                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10765 #endif
10766
10767                 if (str != NULL)
10768                         kmem_free(str, strlen(str) + 1);
10769         }
10770
10771         kmem_free(act, sizeof (dtrace_actdesc_t));
10772 }
10773
10774 /*
10775  * DTrace ECB Functions
10776  */
10777 static dtrace_ecb_t *
10778 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10779 {
10780         dtrace_ecb_t *ecb;
10781         dtrace_epid_t epid;
10782
10783         ASSERT(MUTEX_HELD(&dtrace_lock));
10784
10785         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10786         ecb->dte_predicate = NULL;
10787         ecb->dte_probe = probe;
10788
10789         /*
10790          * The default size is the size of the default action: recording
10791          * the header.
10792          */
10793         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10794         ecb->dte_alignment = sizeof (dtrace_epid_t);
10795
10796         epid = state->dts_epid++;
10797
10798         if (epid - 1 >= state->dts_necbs) {
10799                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10800                 int necbs = state->dts_necbs << 1;
10801
10802                 ASSERT(epid == state->dts_necbs + 1);
10803
10804                 if (necbs == 0) {
10805                         ASSERT(oecbs == NULL);
10806                         necbs = 1;
10807                 }
10808
10809                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10810
10811                 if (oecbs != NULL)
10812                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10813
10814                 dtrace_membar_producer();
10815                 state->dts_ecbs = ecbs;
10816
10817                 if (oecbs != NULL) {
10818                         /*
10819                          * If this state is active, we must dtrace_sync()
10820                          * before we can free the old dts_ecbs array:  we're
10821                          * coming in hot, and there may be active ring
10822                          * buffer processing (which indexes into the dts_ecbs
10823                          * array) on another CPU.
10824                          */
10825                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10826                                 dtrace_sync();
10827
10828                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10829                 }
10830
10831                 dtrace_membar_producer();
10832                 state->dts_necbs = necbs;
10833         }
10834
10835         ecb->dte_state = state;
10836
10837         ASSERT(state->dts_ecbs[epid - 1] == NULL);
10838         dtrace_membar_producer();
10839         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10840
10841         return (ecb);
10842 }
10843
10844 static void
10845 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10846 {
10847         dtrace_probe_t *probe = ecb->dte_probe;
10848
10849         ASSERT(MUTEX_HELD(&cpu_lock));
10850         ASSERT(MUTEX_HELD(&dtrace_lock));
10851         ASSERT(ecb->dte_next == NULL);
10852
10853         if (probe == NULL) {
10854                 /*
10855                  * This is the NULL probe -- there's nothing to do.
10856                  */
10857                 return;
10858         }
10859
10860         if (probe->dtpr_ecb == NULL) {
10861                 dtrace_provider_t *prov = probe->dtpr_provider;
10862
10863                 /*
10864                  * We're the first ECB on this probe.
10865                  */
10866                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10867
10868                 if (ecb->dte_predicate != NULL)
10869                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10870
10871                 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10872                     probe->dtpr_id, probe->dtpr_arg);
10873         } else {
10874                 /*
10875                  * This probe is already active.  Swing the last pointer to
10876                  * point to the new ECB, and issue a dtrace_sync() to assure
10877                  * that all CPUs have seen the change.
10878                  */
10879                 ASSERT(probe->dtpr_ecb_last != NULL);
10880                 probe->dtpr_ecb_last->dte_next = ecb;
10881                 probe->dtpr_ecb_last = ecb;
10882                 probe->dtpr_predcache = 0;
10883
10884                 dtrace_sync();
10885         }
10886 }
10887
10888 static void
10889 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10890 {
10891         dtrace_action_t *act;
10892         uint32_t curneeded = UINT32_MAX;
10893         uint32_t aggbase = UINT32_MAX;
10894
10895         /*
10896          * If we record anything, we always record the dtrace_rechdr_t.  (And
10897          * we always record it first.)
10898          */
10899         ecb->dte_size = sizeof (dtrace_rechdr_t);
10900         ecb->dte_alignment = sizeof (dtrace_epid_t);
10901
10902         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10903                 dtrace_recdesc_t *rec = &act->dta_rec;
10904                 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10905
10906                 ecb->dte_alignment = MAX(ecb->dte_alignment,
10907                     rec->dtrd_alignment);
10908
10909                 if (DTRACEACT_ISAGG(act->dta_kind)) {
10910                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10911
10912                         ASSERT(rec->dtrd_size != 0);
10913                         ASSERT(agg->dtag_first != NULL);
10914                         ASSERT(act->dta_prev->dta_intuple);
10915                         ASSERT(aggbase != UINT32_MAX);
10916                         ASSERT(curneeded != UINT32_MAX);
10917
10918                         agg->dtag_base = aggbase;
10919
10920                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10921                         rec->dtrd_offset = curneeded;
10922                         curneeded += rec->dtrd_size;
10923                         ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10924
10925                         aggbase = UINT32_MAX;
10926                         curneeded = UINT32_MAX;
10927                 } else if (act->dta_intuple) {
10928                         if (curneeded == UINT32_MAX) {
10929                                 /*
10930                                  * This is the first record in a tuple.  Align
10931                                  * curneeded to be at offset 4 in an 8-byte
10932                                  * aligned block.
10933                                  */
10934                                 ASSERT(act->dta_prev == NULL ||
10935                                     !act->dta_prev->dta_intuple);
10936                                 ASSERT3U(aggbase, ==, UINT32_MAX);
10937                                 curneeded = P2PHASEUP(ecb->dte_size,
10938                                     sizeof (uint64_t), sizeof (dtrace_aggid_t));
10939
10940                                 aggbase = curneeded - sizeof (dtrace_aggid_t);
10941                                 ASSERT(IS_P2ALIGNED(aggbase,
10942                                     sizeof (uint64_t)));
10943                         }
10944                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10945                         rec->dtrd_offset = curneeded;
10946                         curneeded += rec->dtrd_size;
10947                 } else {
10948                         /* tuples must be followed by an aggregation */
10949                         ASSERT(act->dta_prev == NULL ||
10950                             !act->dta_prev->dta_intuple);
10951
10952                         ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10953                             rec->dtrd_alignment);
10954                         rec->dtrd_offset = ecb->dte_size;
10955                         ecb->dte_size += rec->dtrd_size;
10956                         ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10957                 }
10958         }
10959
10960         if ((act = ecb->dte_action) != NULL &&
10961             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10962             ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10963                 /*
10964                  * If the size is still sizeof (dtrace_rechdr_t), then all
10965                  * actions store no data; set the size to 0.
10966                  */
10967                 ecb->dte_size = 0;
10968         }
10969
10970         ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10971         ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10972         ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10973             ecb->dte_needed);
10974 }
10975
10976 static dtrace_action_t *
10977 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10978 {
10979         dtrace_aggregation_t *agg;
10980         size_t size = sizeof (uint64_t);
10981         int ntuple = desc->dtad_ntuple;
10982         dtrace_action_t *act;
10983         dtrace_recdesc_t *frec;
10984         dtrace_aggid_t aggid;
10985         dtrace_state_t *state = ecb->dte_state;
10986
10987         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10988         agg->dtag_ecb = ecb;
10989
10990         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
10991
10992         switch (desc->dtad_kind) {
10993         case DTRACEAGG_MIN:
10994                 agg->dtag_initial = INT64_MAX;
10995                 agg->dtag_aggregate = dtrace_aggregate_min;
10996                 break;
10997
10998         case DTRACEAGG_MAX:
10999                 agg->dtag_initial = INT64_MIN;
11000                 agg->dtag_aggregate = dtrace_aggregate_max;
11001                 break;
11002
11003         case DTRACEAGG_COUNT:
11004                 agg->dtag_aggregate = dtrace_aggregate_count;
11005                 break;
11006
11007         case DTRACEAGG_QUANTIZE:
11008                 agg->dtag_aggregate = dtrace_aggregate_quantize;
11009                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11010                     sizeof (uint64_t);
11011                 break;
11012
11013         case DTRACEAGG_LQUANTIZE: {
11014                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11015                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11016
11017                 agg->dtag_initial = desc->dtad_arg;
11018                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11019
11020                 if (step == 0 || levels == 0)
11021                         goto err;
11022
11023                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11024                 break;
11025         }
11026
11027         case DTRACEAGG_LLQUANTIZE: {
11028                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11029                 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11030                 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11031                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11032                 int64_t v;
11033
11034                 agg->dtag_initial = desc->dtad_arg;
11035                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11036
11037                 if (factor < 2 || low >= high || nsteps < factor)
11038                         goto err;
11039
11040                 /*
11041                  * Now check that the number of steps evenly divides a power
11042                  * of the factor.  (This assures both integer bucket size and
11043                  * linearity within each magnitude.)
11044                  */
11045                 for (v = factor; v < nsteps; v *= factor)
11046                         continue;
11047
11048                 if ((v % nsteps) || (nsteps % factor))
11049                         goto err;
11050
11051                 size = (dtrace_aggregate_llquantize_bucket(factor,
11052                     low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11053                 break;
11054         }
11055
11056         case DTRACEAGG_AVG:
11057                 agg->dtag_aggregate = dtrace_aggregate_avg;
11058                 size = sizeof (uint64_t) * 2;
11059                 break;
11060
11061         case DTRACEAGG_STDDEV:
11062                 agg->dtag_aggregate = dtrace_aggregate_stddev;
11063                 size = sizeof (uint64_t) * 4;
11064                 break;
11065
11066         case DTRACEAGG_SUM:
11067                 agg->dtag_aggregate = dtrace_aggregate_sum;
11068                 break;
11069
11070         default:
11071                 goto err;
11072         }
11073
11074         agg->dtag_action.dta_rec.dtrd_size = size;
11075
11076         if (ntuple == 0)
11077                 goto err;
11078
11079         /*
11080          * We must make sure that we have enough actions for the n-tuple.
11081          */
11082         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11083                 if (DTRACEACT_ISAGG(act->dta_kind))
11084                         break;
11085
11086                 if (--ntuple == 0) {
11087                         /*
11088                          * This is the action with which our n-tuple begins.
11089                          */
11090                         agg->dtag_first = act;
11091                         goto success;
11092                 }
11093         }
11094
11095         /*
11096          * This n-tuple is short by ntuple elements.  Return failure.
11097          */
11098         ASSERT(ntuple != 0);
11099 err:
11100         kmem_free(agg, sizeof (dtrace_aggregation_t));
11101         return (NULL);
11102
11103 success:
11104         /*
11105          * If the last action in the tuple has a size of zero, it's actually
11106          * an expression argument for the aggregating action.
11107          */
11108         ASSERT(ecb->dte_action_last != NULL);
11109         act = ecb->dte_action_last;
11110
11111         if (act->dta_kind == DTRACEACT_DIFEXPR) {
11112                 ASSERT(act->dta_difo != NULL);
11113
11114                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11115                         agg->dtag_hasarg = 1;
11116         }
11117
11118         /*
11119          * We need to allocate an id for this aggregation.
11120          */
11121 #if defined(sun)
11122         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11123             VM_BESTFIT | VM_SLEEP);
11124 #else
11125         aggid = alloc_unr(state->dts_aggid_arena);
11126 #endif
11127
11128         if (aggid - 1 >= state->dts_naggregations) {
11129                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11130                 dtrace_aggregation_t **aggs;
11131                 int naggs = state->dts_naggregations << 1;
11132                 int onaggs = state->dts_naggregations;
11133
11134                 ASSERT(aggid == state->dts_naggregations + 1);
11135
11136                 if (naggs == 0) {
11137                         ASSERT(oaggs == NULL);
11138                         naggs = 1;
11139                 }
11140
11141                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11142
11143                 if (oaggs != NULL) {
11144                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11145                         kmem_free(oaggs, onaggs * sizeof (*aggs));
11146                 }
11147
11148                 state->dts_aggregations = aggs;
11149                 state->dts_naggregations = naggs;
11150         }
11151
11152         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11153         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11154
11155         frec = &agg->dtag_first->dta_rec;
11156         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11157                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11158
11159         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11160                 ASSERT(!act->dta_intuple);
11161                 act->dta_intuple = 1;
11162         }
11163
11164         return (&agg->dtag_action);
11165 }
11166
11167 static void
11168 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11169 {
11170         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11171         dtrace_state_t *state = ecb->dte_state;
11172         dtrace_aggid_t aggid = agg->dtag_id;
11173
11174         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11175 #if defined(sun)
11176         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11177 #else
11178         free_unr(state->dts_aggid_arena, aggid);
11179 #endif
11180
11181         ASSERT(state->dts_aggregations[aggid - 1] == agg);
11182         state->dts_aggregations[aggid - 1] = NULL;
11183
11184         kmem_free(agg, sizeof (dtrace_aggregation_t));
11185 }
11186
11187 static int
11188 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11189 {
11190         dtrace_action_t *action, *last;
11191         dtrace_difo_t *dp = desc->dtad_difo;
11192         uint32_t size = 0, align = sizeof (uint8_t), mask;
11193         uint16_t format = 0;
11194         dtrace_recdesc_t *rec;
11195         dtrace_state_t *state = ecb->dte_state;
11196         dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11197         uint64_t arg = desc->dtad_arg;
11198
11199         ASSERT(MUTEX_HELD(&dtrace_lock));
11200         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11201
11202         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11203                 /*
11204                  * If this is an aggregating action, there must be neither
11205                  * a speculate nor a commit on the action chain.
11206                  */
11207                 dtrace_action_t *act;
11208
11209                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11210                         if (act->dta_kind == DTRACEACT_COMMIT)
11211                                 return (EINVAL);
11212
11213                         if (act->dta_kind == DTRACEACT_SPECULATE)
11214                                 return (EINVAL);
11215                 }
11216
11217                 action = dtrace_ecb_aggregation_create(ecb, desc);
11218
11219                 if (action == NULL)
11220                         return (EINVAL);
11221         } else {
11222                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11223                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11224                     dp != NULL && dp->dtdo_destructive)) {
11225                         state->dts_destructive = 1;
11226                 }
11227
11228                 switch (desc->dtad_kind) {
11229                 case DTRACEACT_PRINTF:
11230                 case DTRACEACT_PRINTA:
11231                 case DTRACEACT_SYSTEM:
11232                 case DTRACEACT_FREOPEN:
11233                 case DTRACEACT_DIFEXPR:
11234                         /*
11235                          * We know that our arg is a string -- turn it into a
11236                          * format.
11237                          */
11238                         if (arg == 0) {
11239                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11240                                     desc->dtad_kind == DTRACEACT_DIFEXPR);
11241                                 format = 0;
11242                         } else {
11243                                 ASSERT(arg != 0);
11244 #if defined(sun)
11245                                 ASSERT(arg > KERNELBASE);
11246 #endif
11247                                 format = dtrace_format_add(state,
11248                                     (char *)(uintptr_t)arg);
11249                         }
11250
11251                         /*FALLTHROUGH*/
11252                 case DTRACEACT_LIBACT:
11253                 case DTRACEACT_TRACEMEM:
11254                 case DTRACEACT_TRACEMEM_DYNSIZE:
11255                         if (dp == NULL)
11256                                 return (EINVAL);
11257
11258                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11259                                 break;
11260
11261                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11262                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11263                                         return (EINVAL);
11264
11265                                 size = opt[DTRACEOPT_STRSIZE];
11266                         }
11267
11268                         break;
11269
11270                 case DTRACEACT_STACK:
11271                         if ((nframes = arg) == 0) {
11272                                 nframes = opt[DTRACEOPT_STACKFRAMES];
11273                                 ASSERT(nframes > 0);
11274                                 arg = nframes;
11275                         }
11276
11277                         size = nframes * sizeof (pc_t);
11278                         break;
11279
11280                 case DTRACEACT_JSTACK:
11281                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11282                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11283
11284                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11285                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11286
11287                         arg = DTRACE_USTACK_ARG(nframes, strsize);
11288
11289                         /*FALLTHROUGH*/
11290                 case DTRACEACT_USTACK:
11291                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
11292                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11293                                 strsize = DTRACE_USTACK_STRSIZE(arg);
11294                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
11295                                 ASSERT(nframes > 0);
11296                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
11297                         }
11298
11299                         /*
11300                          * Save a slot for the pid.
11301                          */
11302                         size = (nframes + 1) * sizeof (uint64_t);
11303                         size += DTRACE_USTACK_STRSIZE(arg);
11304                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11305
11306                         break;
11307
11308                 case DTRACEACT_SYM:
11309                 case DTRACEACT_MOD:
11310                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11311                             sizeof (uint64_t)) ||
11312                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11313                                 return (EINVAL);
11314                         break;
11315
11316                 case DTRACEACT_USYM:
11317                 case DTRACEACT_UMOD:
11318                 case DTRACEACT_UADDR:
11319                         if (dp == NULL ||
11320                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11321                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11322                                 return (EINVAL);
11323
11324                         /*
11325                          * We have a slot for the pid, plus a slot for the
11326                          * argument.  To keep things simple (aligned with
11327                          * bitness-neutral sizing), we store each as a 64-bit
11328                          * quantity.
11329                          */
11330                         size = 2 * sizeof (uint64_t);
11331                         break;
11332
11333                 case DTRACEACT_STOP:
11334                 case DTRACEACT_BREAKPOINT:
11335                 case DTRACEACT_PANIC:
11336                         break;
11337
11338                 case DTRACEACT_CHILL:
11339                 case DTRACEACT_DISCARD:
11340                 case DTRACEACT_RAISE:
11341                         if (dp == NULL)
11342                                 return (EINVAL);
11343                         break;
11344
11345                 case DTRACEACT_EXIT:
11346                         if (dp == NULL ||
11347                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11348                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11349                                 return (EINVAL);
11350                         break;
11351
11352                 case DTRACEACT_SPECULATE:
11353                         if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11354                                 return (EINVAL);
11355
11356                         if (dp == NULL)
11357                                 return (EINVAL);
11358
11359                         state->dts_speculates = 1;
11360                         break;
11361
11362                 case DTRACEACT_PRINTM:
11363                         size = dp->dtdo_rtype.dtdt_size;
11364                         break;
11365
11366                 case DTRACEACT_PRINTT:
11367                         size = dp->dtdo_rtype.dtdt_size;
11368                         break;
11369
11370                 case DTRACEACT_COMMIT: {
11371                         dtrace_action_t *act = ecb->dte_action;
11372
11373                         for (; act != NULL; act = act->dta_next) {
11374                                 if (act->dta_kind == DTRACEACT_COMMIT)
11375                                         return (EINVAL);
11376                         }
11377
11378                         if (dp == NULL)
11379                                 return (EINVAL);
11380                         break;
11381                 }
11382
11383                 default:
11384                         return (EINVAL);
11385                 }
11386
11387                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11388                         /*
11389                          * If this is a data-storing action or a speculate,
11390                          * we must be sure that there isn't a commit on the
11391                          * action chain.
11392                          */
11393                         dtrace_action_t *act = ecb->dte_action;
11394
11395                         for (; act != NULL; act = act->dta_next) {
11396                                 if (act->dta_kind == DTRACEACT_COMMIT)
11397                                         return (EINVAL);
11398                         }
11399                 }
11400
11401                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11402                 action->dta_rec.dtrd_size = size;
11403         }
11404
11405         action->dta_refcnt = 1;
11406         rec = &action->dta_rec;
11407         size = rec->dtrd_size;
11408
11409         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11410                 if (!(size & mask)) {
11411                         align = mask + 1;
11412                         break;
11413                 }
11414         }
11415
11416         action->dta_kind = desc->dtad_kind;
11417
11418         if ((action->dta_difo = dp) != NULL)
11419                 dtrace_difo_hold(dp);
11420
11421         rec->dtrd_action = action->dta_kind;
11422         rec->dtrd_arg = arg;
11423         rec->dtrd_uarg = desc->dtad_uarg;
11424         rec->dtrd_alignment = (uint16_t)align;
11425         rec->dtrd_format = format;
11426
11427         if ((last = ecb->dte_action_last) != NULL) {
11428                 ASSERT(ecb->dte_action != NULL);
11429                 action->dta_prev = last;
11430                 last->dta_next = action;
11431         } else {
11432                 ASSERT(ecb->dte_action == NULL);
11433                 ecb->dte_action = action;
11434         }
11435
11436         ecb->dte_action_last = action;
11437
11438         return (0);
11439 }
11440
/*
 * Detach the action list from an ECB and reset the ECB's action state.
 *
 * The first action's dta_refcnt counts the ECBs that share the list:
 * dtrace_ecb_action_add() sets it to 1 and dtrace_ecb_create() increments it
 * when it reuses a cached ECB's actions.  If other ECBs still share the list,
 * only that count is dropped; otherwise every action on the list is torn
 * down and freed.  In both cases the ECB's dte_action, dte_action_last and
 * dte_size are cleared on return.
 */
11441 static void
11442 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11443 {
11444         dtrace_action_t *act = ecb->dte_action, *next;
11445         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11446         dtrace_difo_t *dp;
11447         uint16_t format;
11448
              /*
               * Shared list:  give up this ECB's reference and leave the
               * actions themselves untouched.
               */
11449         if (act != NULL && act->dta_refcnt > 1) {
11450                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11451                 act->dta_refcnt--;
11452         } else {
11453                 for (; act != NULL; act = next) {
                              /*
                               * Fetch the successor first; act is freed
                               * before the loop advances.
                               */
11454                         next = act->dta_next;
11455                         ASSERT(next != NULL || act == ecb->dte_action_last);
11456                         ASSERT(act->dta_refcnt == 1);
11457
                              /*
                               * A format of zero means that no format was
                               * recorded for this action.
                               */
11458                         if ((format = act->dta_rec.dtrd_format) != 0)
11459                                 dtrace_format_remove(ecb->dte_state, format);
11460
11461                         if ((dp = act->dta_difo) != NULL)
11462                                 dtrace_difo_release(dp, vstate);
11463
                              /*
                               * Aggregating actions are handed to
                               * dtrace_ecb_aggregation_destroy() rather than
                               * being freed directly here.
                               */
11464                         if (DTRACEACT_ISAGG(act->dta_kind)) {
11465                                 dtrace_ecb_aggregation_destroy(ecb, act);
11466                         } else {
11467                                 kmem_free(act, sizeof (dtrace_action_t));
11468                         }
11469                 }
11470         }
11471
              /*
               * Shared or freed, this ECB no longer refers to any actions.
               */
11472         ecb->dte_action = NULL;
11473         ecb->dte_action_last = NULL;
11474         ecb->dte_size = 0;
11475 }
11476
/*
 * Disable an ECB by unlinking it from its probe's singly-linked ECB list.
 * If it was the probe's last ECB, the probe itself is disabled through the
 * provider's dtps_disable operation.  The caller must hold dtrace_lock
 * (asserted).  An ECB with no probe (dte_probe == NULL) is left untouched.
 */
11477 static void
11478 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11479 {
11480         /*
11481          * We disable the ECB by removing it from its probe.
11482          */
11483         dtrace_ecb_t *pecb, *prev = NULL;
11484         dtrace_probe_t *probe = ecb->dte_probe;
11485
11486         ASSERT(MUTEX_HELD(&dtrace_lock));
11487
11488         if (probe == NULL) {
11489                 /*
11490                  * This is the NULL probe; there is nothing to disable.
11491                  */
11492                 return;
11493         }
11494
              /*
               * Walk the probe's ECB list to find this ECB and remember its
               * predecessor (prev stays NULL if the ECB is the list head).
               */
11495         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11496                 if (pecb == ecb)
11497                         break;
11498                 prev = pecb;
11499         }
11500
11501         ASSERT(pecb != NULL);
11502
              /*
               * Unlink:  either advance the list head or bypass the ECB.
               */
11503         if (prev == NULL) {
11504                 probe->dtpr_ecb = ecb->dte_next;
11505         } else {
11506                 prev->dte_next = ecb->dte_next;
11507         }
11508
              /*
               * If the ECB was the tail, the predecessor (possibly NULL)
               * becomes the new tail.
               */
11509         if (ecb == probe->dtpr_ecb_last) {
11510                 ASSERT(ecb->dte_next == NULL);
11511                 probe->dtpr_ecb_last = prev;
11512         }
11513
11514         /*
11515          * The ECB has been disconnected from the probe; now sync to assure
11516          * that all CPUs have seen the change before returning.
11517          */
11518         dtrace_sync();
11519
11520         if (probe->dtpr_ecb == NULL) {
11521                 /*
11522                  * That was the last ECB on the probe; clear the predicate
11523                  * cache ID for the probe, disable it and sync one more time
11524                  * to assure that we'll never hit it again.
11525                  */
11526                 dtrace_provider_t *prov = probe->dtpr_provider;
11527
11528                 ASSERT(ecb->dte_next == NULL);
11529                 ASSERT(probe->dtpr_ecb_last == NULL);
11530                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11531                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11532                     probe->dtpr_id, probe->dtpr_arg);
11533                 dtrace_sync();
11534         } else {
11535                 /*
11536                  * There is at least one ECB remaining on the probe.  If there
11537                  * is _exactly_ one, set the probe's predicate cache ID to be
11538                  * the predicate cache ID of the remaining ECB.
11539                  */
11540                 ASSERT(probe->dtpr_ecb_last != NULL);
11541                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11542
11543                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11544                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11545
11546                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
11547
11548                         if (p != NULL)
11549                                 probe->dtpr_predcache = p->dtp_cacheid;
11550                 }
11551
                      /*
                       * Clear the removed ECB's own link; dtrace_ecb_destroy()
                       * asserts that dte_next is NULL.
                       */
11552                 ecb->dte_next = NULL;
11553         }
11554 }
11555
/*
 * Destroy an ECB:  release its predicate (if any), remove its actions, clear
 * its slot in the owning state's dts_ecbs table and free the ECB itself.
 *
 * The caller must hold dtrace_lock.  The ECB must already be off its probe's
 * list:  the assertions require dte_next to be NULL and the ECB not to be
 * the head of its probe's ECB list.
 */
11556 static void
11557 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11558 {
11559         dtrace_state_t *state = ecb->dte_state;
11560         dtrace_vstate_t *vstate = &state->dts_vstate;
11561         dtrace_predicate_t *pred;
11562         dtrace_epid_t epid = ecb->dte_epid;
11563
11564         ASSERT(MUTEX_HELD(&dtrace_lock));
11565         ASSERT(ecb->dte_next == NULL);
11566         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11567
11568         if ((pred = ecb->dte_predicate) != NULL)
11569                 dtrace_predicate_release(pred, vstate);
11570
11571         dtrace_ecb_action_remove(ecb);
11572
              /*
               * The dts_ecbs table is indexed by EPID minus one; empty the
               * slot so that later lookups of this EPID yield NULL.
               */
11573         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11574         state->dts_ecbs[epid - 1] = NULL;
11575
11576         kmem_free(ecb, sizeof (dtrace_ecb_t));
11577 }
11578
/*
 * Create an ECB for the given state and probe from the enabling's current
 * ECB description (enab->dten_current).
 *
 * The new ECB takes a hold on the description's predicate, if any, and (for
 * a non-NULL probe) picks up implicit-predicate bits in dte_cond based on
 * the consumer's credentials and the provider's privilege flags.
 *
 * If dtrace_ecb_create_cache is non-NULL, the cached ECB's action list is
 * shared rather than rebuilt.  Otherwise the actions are added one by one
 * and the resulting ECB is recorded as the new cache.  (Where the cache is
 * reset is not visible in this function.)
 *
 * Returns the new ECB.  If adding an action fails, the error is stored in
 * enab->dten_error, the ECB is destroyed and NULL is returned.  The caller
 * must hold dtrace_lock.
 */
11579 static dtrace_ecb_t *
11580 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11581     dtrace_enabling_t *enab)
11582 {
11583         dtrace_ecb_t *ecb;
11584         dtrace_predicate_t *pred;
11585         dtrace_actdesc_t *act;
11586         dtrace_provider_t *prov;
11587         dtrace_ecbdesc_t *desc = enab->dten_current;
11588
11589         ASSERT(MUTEX_HELD(&dtrace_lock));
11590         ASSERT(state != NULL);
11591
11592         ecb = dtrace_ecb_add(state, probe);
11593         ecb->dte_uarg = desc->dted_uarg;
11594
11595         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11596                 dtrace_predicate_hold(pred);
11597                 ecb->dte_predicate = pred;
11598         }
11599
11600         if (probe != NULL) {
11601                 /*
11602                  * If the provider shows more leg than the consumer is old
11603                  * enough to see, we need to enable the appropriate implicit
11604                  * predicate bits to prevent the ecb from activating at
11605                  * revealing times.
11606                  *
11607                  * Providers specifying DTRACE_PRIV_USER at register time
11608                  * are stating that they need the /proc-style privilege
11609                  * model to be enforced, and this is what DTRACE_COND_OWNER
11610                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11611                  */
11612                 prov = probe->dtpr_provider;
11613                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11614                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11615                         ecb->dte_cond |= DTRACE_COND_OWNER;
11616
11617                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11618                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11619                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11620
11621                 /*
11622                  * If the provider shows us kernel innards and the user
11623                  * is lacking sufficient privilege, enable the
11624                  * DTRACE_COND_USERMODE implicit predicate.
11625                  */
11626                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11627                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11628                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11629         }
11630
11631         if (dtrace_ecb_create_cache != NULL) {
11632                 /*
11633                  * If we have a cached ecb, we'll use its action list instead
11634                  * of creating our own (saving both time and space).
11635                  */
11636                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
                      /*
                       * Note that this 'act' (a dtrace_action_t pointer)
                       * shadows the function-scope 'act' (a dtrace_actdesc_t
                       * pointer) within this block.
                       */
11637                 dtrace_action_t *act = cached->dte_action;
11638
                      /*
                       * Sharing is recorded by bumping the reference count on
                       * the first action of the list; the size, space needed
                       * and alignment are copied from the cached ECB.  If the
                       * cached ECB has no actions, the new ECB is returned
                       * with none of these fields copied.
                       */
11639                 if (act != NULL) {
11640                         ASSERT(act->dta_refcnt > 0);
11641                         act->dta_refcnt++;
11642                         ecb->dte_action = act;
11643                         ecb->dte_action_last = cached->dte_action_last;
11644                         ecb->dte_needed = cached->dte_needed;
11645                         ecb->dte_size = cached->dte_size;
11646                         ecb->dte_alignment = cached->dte_alignment;
11647                 }
11648
11649                 return (ecb);
11650         }
11651
              /*
               * No cached ECB:  build the action list from the description,
               * abandoning the ECB on the first failure.
               */
11652         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11653                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11654                         dtrace_ecb_destroy(ecb);
11655                         return (NULL);
11656                 }
11657         }
11658
11659         dtrace_ecb_resize(ecb);
11660
              /*
               * Remember this ECB so that later calls can share its actions.
               */
11661         return (dtrace_ecb_create_cache = ecb);
11662 }
11663
/*
 * Create an ECB on 'probe' for the enabling passed in 'arg' (a
 * dtrace_enabling_t pointer) and enable it.
 *
 * Returns DTRACE_MATCH_NEXT when the probe was skipped or the ECB was
 * created and enabled, and DTRACE_MATCH_DONE when ECB creation failed.
 * (The int return and void pointer argument suggest that this is used as a
 * per-probe match callback; the caller is not visible here -- confirm.)
 */
11664 static int
11665 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11666 {
11667         dtrace_ecb_t *ecb;
11668         dtrace_enabling_t *enab = arg;
11669         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11670
11671         ASSERT(state != NULL);
11672
11673         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11674                 /*
11675                  * This probe was created in a generation for which this
11676                  * enabling has previously created ECBs; we don't want to
11677                  * enable it again, so just kick out.
11678                  */
11679                 return (DTRACE_MATCH_NEXT);
11680         }
11681
              /*
               * On failure dtrace_ecb_create() has already stored the error
               * in enab->dten_error.
               */
11682         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11683                 return (DTRACE_MATCH_DONE);
11684
11685         dtrace_ecb_enable(ecb);
11686         return (DTRACE_MATCH_NEXT);
11687 }
11688
/*
 * Look up the ECB for an enabled probe ID (EPID) in the given state.
 *
 * Returns NULL if the ID is zero or exceeds state->dts_necbs; otherwise
 * returns the entry in slot id - 1 of state->dts_ecbs, which may itself be
 * NULL.  The caller must hold dtrace_lock.
 */
11689 static dtrace_ecb_t *
11690 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11691 {
11692         dtrace_ecb_t *ecb;
11693
11694         ASSERT(MUTEX_HELD(&dtrace_lock));
11695
11696         if (id == 0 || id > state->dts_necbs)
11697                 return (NULL);
11698
              /*
               * 'ecb' is assigned only inside the ASSERT expression below.
               * The return statement re-reads the table instead of using
               * 'ecb', so the result does not depend on whether ASSERT
               * evaluates its argument.
               */
11699         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11700         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11701
11702         return (state->dts_ecbs[id - 1]);
11703 }
11704
/*
 * Look up the aggregation for an aggregation ID in the given state.
 *
 * Returns NULL if the ID is zero or exceeds state->dts_naggregations;
 * otherwise returns the entry in slot id - 1 of state->dts_aggregations,
 * which may itself be NULL.  The caller must hold dtrace_lock.
 */
11705 static dtrace_aggregation_t *
11706 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11707 {
11708         dtrace_aggregation_t *agg;
11709
11710         ASSERT(MUTEX_HELD(&dtrace_lock));
11711
11712         if (id == 0 || id > state->dts_naggregations)
11713                 return (NULL);
11714
              /*
               * 'agg' is assigned only inside the ASSERT expression below.
               * The return statement re-reads the table instead of using
               * 'agg', so the result does not depend on whether ASSERT
               * evaluates its argument.
               */
11715         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11716         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11717             agg->dtag_id == id);
11718
11719         return (state->dts_aggregations[id - 1]);
11720 }
11721
11722 /*
11723  * DTrace Buffer Functions
11724  *
11725  * The following functions manipulate DTrace buffers.  Most of these functions
11726  * are called in the context of establishing or processing consumer state;
11727  * exceptions are explicitly noted.
11728  */
11729
11730 /*
11731  * Note:  called from cross call context.  This function switches the two
11732  * buffers on a given CPU.  The atomicity of this operation is assured by
11733  * disabling interrupts while the actual switch takes place; the disabling of
11734  * interrupts serializes the execution with any execution of dtrace_probe() on
11735  * the same CPU.
       *
       * Along with the pointer exchange, the drop count, offset, error count
       * and flags accumulated so far are saved in the dtb_xamot_* fields, the
       * live counters are reset, and the time since the previous switch is
       * recorded in dtb_interval.  The buffer must be neither a NOSWITCH nor
       * a RING buffer (asserted).
11736  */
11737 static void
11738 dtrace_buffer_switch(dtrace_buffer_t *buf)
11739 {
11740         caddr_t tomax = buf->dtb_tomax;
11741         caddr_t xamot = buf->dtb_xamot;
11742         dtrace_icookie_t cookie;
11743         hrtime_t now;
11744
11745         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11746         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11747
11748         cookie = dtrace_interrupt_disable();
11749         now = dtrace_gethrtime();
              /*
               * Exchange the two buffer pointers.
               */
11750         buf->dtb_tomax = xamot;
11751         buf->dtb_xamot = tomax;
              /*
               * Save the accounting that belongs to the buffer that was just
               * switched out.
               */
11752         buf->dtb_xamot_drops = buf->dtb_drops;
11753         buf->dtb_xamot_offset = buf->dtb_offset;
11754         buf->dtb_xamot_errors = buf->dtb_errors;
11755         buf->dtb_xamot_flags = buf->dtb_flags;
              /*
               * Start the newly installed buffer with clean accounting; of
               * the flags, only the error and dropped bits are cleared.
               */
11756         buf->dtb_offset = 0;
11757         buf->dtb_drops = 0;
11758         buf->dtb_errors = 0;
11759         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
              /*
               * Record the time elapsed since the previous switch and
               * timestamp this one.
               */
11760         buf->dtb_interval = now - buf->dtb_switched;
11761         buf->dtb_switched = now;
11762         dtrace_interrupt_enable(cookie);
11763 }
11764
11765 /*
11766  * Note:  called from cross call context.  This function activates a buffer
11767  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11768  * is guaranteed by the disabling of interrupts.
       *
       * The buffer acted upon is the given state's buffer for the CPU on which
       * this function is executing (state->dts_buffer[curcpu]).
11769  */
11770 static void
11771 dtrace_buffer_activate(dtrace_state_t *state)
11772 {
11773         dtrace_buffer_t *buf;
11774         dtrace_icookie_t cookie = dtrace_interrupt_disable();
11775
11776         buf = &state->dts_buffer[curcpu];
11777
              /*
               * Only a CPU that actually has a buffer allocated is activated.
               */
11778         if (buf->dtb_tomax != NULL) {
11779                 /*
11780                  * We might like to assert that the buffer is marked inactive,
11781                  * but this isn't necessarily true:  the buffer for the CPU
11782                  * that processes the BEGIN probe has its buffer activated
11783                  * manually.  In this case, we take the (harmless) action
11784                  * of re-clearing the INACTIVE bit.
11785                  */
11786                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11787         }
11788
11789         dtrace_interrupt_enable(cookie);
11790 }
11791
/*
 * Allocate the per-CPU trace buffers in the array 'bufs'.
 *
 * For the CPU given by 'cpu' (or for every CPU when cpu is DTRACE_CPUALL),
 * a zeroed principal buffer (dtb_tomax) of 'size' bytes is allocated and,
 * unless 'flags' contains DTRACEBUF_NOSWITCH, a second buffer (dtb_xamot) of
 * the same size.  A CPU whose principal buffer already exists is skipped.
 * All allocations are made with KM_NOSLEEP.
 *
 * Returns 0 on success.  On the Solaris ("sun") build, EFBIG is returned if
 * size exceeds dtrace_nonroot_maxsize and the privilege check fails; on the
 * other build, ENOMEM is returned up front on amd64, mips and powerpc if
 * size exceeds a per-CPU share of physical memory.  If any allocation fails,
 * every buffer belonging to the selected CPUs is freed and ENOMEM is
 * returned.
 *
 * '*factor' is set to 1 on entry.  On the allocation-failure path it is set
 * to desired / allocated, where 'desired' counts two buffers for each
 * selected CPU (regardless of DTRACEBUF_NOSWITCH) and 'allocated' counts the
 * buffers found allocated during cleanup.  (How callers use the factor is
 * not visible here; presumably it guides a retry at a smaller size.)
 *
 * The caller must hold dtrace_lock and, on the Solaris build, cpu_lock.
 */
11792 static int
11793 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11794     processorid_t cpu, int *factor)
11795 {
11796 #if defined(sun)
11797         cpu_t *cp;
11798 #endif
11799         dtrace_buffer_t *buf;
11800         int allocated = 0, desired = 0;
11801
11802 #if defined(sun)
11803         ASSERT(MUTEX_HELD(&cpu_lock));
11804         ASSERT(MUTEX_HELD(&dtrace_lock));
11805
11806         *factor = 1;
11807
11808         if (size > dtrace_nonroot_maxsize &&
11809             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11810                 return (EFBIG);
11811
11812         cp = cpu_list;
11813
              /*
               * cpu_list is traversed as a circular list:  the loop ends when
               * cpu_next leads back to the starting element.
               */
11814         do {
11815                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11816                         continue;
11817
11818                 buf = &bufs[cp->cpu_id];
11819
11820                 /*
11821                  * If there is already a buffer allocated for this CPU, it
11822                  * is only possible that this is a DR event.  In this case,
                       * the buffer size must match our specified size.
11823                  */
11824                 if (buf->dtb_tomax != NULL) {
11825                         ASSERT(buf->dtb_size == size);
11826                         continue;
11827                 }
11828
11829                 ASSERT(buf->dtb_xamot == NULL);
11830
11831                 if ((buf->dtb_tomax = kmem_zalloc(size,
11832                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11833                         goto err;
11834
11835                 buf->dtb_size = size;
11836                 buf->dtb_flags = flags;
11837                 buf->dtb_offset = 0;
11838                 buf->dtb_drops = 0;
11839
                      /*
                       * A NOSWITCH buffer has no second buffer to switch to.
                       */
11840                 if (flags & DTRACEBUF_NOSWITCH)
11841                         continue;
11842
11843                 if ((buf->dtb_xamot = kmem_zalloc(size,
11844                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11845                         goto err;
11846         } while ((cp = cp->cpu_next) != cpu_list);
11847
11848         return (0);
11849
11850 err:
              /*
               * An allocation failed:  free whatever buffers the selected
               * CPUs hold, counting how many were wanted and how many were
               * actually present.
               */
11851         cp = cpu_list;
11852
11853         do {
11854                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11855                         continue;
11856
11857                 buf = &bufs[cp->cpu_id];
11858                 desired += 2;
11859
11860                 if (buf->dtb_xamot != NULL) {
11861                         ASSERT(buf->dtb_tomax != NULL);
11862                         ASSERT(buf->dtb_size == size);
11863                         kmem_free(buf->dtb_xamot, size);
11864                         allocated++;
11865                 }
11866
11867                 if (buf->dtb_tomax != NULL) {
11868                         ASSERT(buf->dtb_size == size);
11869                         kmem_free(buf->dtb_tomax, size);
11870                         allocated++;
11871                 }
11872
11873                 buf->dtb_tomax = NULL;
11874                 buf->dtb_xamot = NULL;
11875                 buf->dtb_size = 0;
11876         } while ((cp = cp->cpu_next) != cpu_list);
11877 #else
11878         int i;
11879
11880         *factor = 1;
11881 #if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11882         /*
11883          * FreeBSD isn't good at limiting the amount of memory we
11884          * ask to malloc, so let's place a limit here before trying
11885          * to do something that might well end in tears at bedtime.
11886          */
11887         if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11888                 return (ENOMEM);
11889 #endif
11890
11891         ASSERT(MUTEX_HELD(&dtrace_lock));
11892         CPU_FOREACH(i) {
11893                 if (cpu != DTRACE_CPUALL && cpu != i)
11894                         continue;
11895
11896                 buf = &bufs[i];
11897
11898                 /*
11899                  * If there is already a buffer allocated for this CPU, it
11900                  * is only possible that this is a DR event.  In this case,
11901                  * the buffer size must match our specified size.
11902                  */
11903                 if (buf->dtb_tomax != NULL) {
11904                         ASSERT(buf->dtb_size == size);
11905                         continue;
11906                 }
11907
11908                 ASSERT(buf->dtb_xamot == NULL);
11909
11910                 if ((buf->dtb_tomax = kmem_zalloc(size,
11911                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11912                         goto err;
11913
11914                 buf->dtb_size = size;
11915                 buf->dtb_flags = flags;
11916                 buf->dtb_offset = 0;
11917                 buf->dtb_drops = 0;
11918
                      /*
                       * A NOSWITCH buffer has no second buffer to switch to.
                       */
11919                 if (flags & DTRACEBUF_NOSWITCH)
11920                         continue;
11921
11922                 if ((buf->dtb_xamot = kmem_zalloc(size,
11923                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11924                         goto err;
11925         }
11926
11927         return (0);
11928
11929 err:
11930         /*
11931          * Error allocating memory, so free the buffers that were
11932          * allocated before the failed allocation.
11933          */
11934         CPU_FOREACH(i) {
11935                 if (cpu != DTRACE_CPUALL && cpu != i)
11936                         continue;
11937
11938                 buf = &bufs[i];
11939                 desired += 2;
11940
11941                 if (buf->dtb_xamot != NULL) {
11942                         ASSERT(buf->dtb_tomax != NULL);
11943                         ASSERT(buf->dtb_size == size);
11944                         kmem_free(buf->dtb_xamot, size);
11945                         allocated++;
11946                 }
11947
11948                 if (buf->dtb_tomax != NULL) {
11949                         ASSERT(buf->dtb_size == size);
11950                         kmem_free(buf->dtb_tomax, size);
11951                         allocated++;
11952                 }
11953
11954                 buf->dtb_tomax = NULL;
11955                 buf->dtb_xamot = NULL;
11956                 buf->dtb_size = 0;
11957
11958         }
11959 #endif
              /*
               * Both builds arrive here only by way of the err path.  Report
               * the ratio of buffers wanted to buffers obtained; the
               * conditional keeps the divisor from being zero when nothing
               * at all was allocated.
               */
11960         *factor = desired / (allocated > 0 ? allocated : 1);
11961
11962         return (ENOMEM);
11963 }
11964
11965 /*
11966  * Note:  called from probe context.  This function just increments the drop
11967  * count on a buffer.  It has been made a function to allow for the
11968  * possibility of understanding the source of mysterious drop counts.  (A
11969  * problem for which one may be particularly disappointed that DTrace cannot
11970  * be used to understand DTrace.)
       *
       * The increment is a plain ++ on buf->dtb_drops; no locking or atomic
       * operation is performed here.
11971  */
11972 static void
11973 dtrace_buffer_drop(dtrace_buffer_t *buf)
11974 {
11975         buf->dtb_drops++;
11976 }
11977
11978 /*
11979  * Note:  called from probe context.  This function is called to reserve space
11980  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11981  * mstate.  Returns the new offset in the buffer, or a negative value if an
11982  * error has occurred.
11983  */
11984 static intptr_t
11985 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11986     dtrace_state_t *state, dtrace_mstate_t *mstate)
11987 {
11988         intptr_t offs = buf->dtb_offset, soffs;
11989         intptr_t woffs;
11990         caddr_t tomax;
11991         size_t total;
11992
11993         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
11994                 return (-1);
11995
11996         if ((tomax = buf->dtb_tomax) == NULL) {
11997                 dtrace_buffer_drop(buf);
11998                 return (-1);
11999         }
12000
12001         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12002                 while (offs & (align - 1)) {
12003                         /*
12004                          * Assert that our alignment is off by a number which
12005                          * is itself sizeof (uint32_t) aligned.
12006                          */
12007                         ASSERT(!((align - (offs & (align - 1))) &
12008                             (sizeof (uint32_t) - 1)));
12009                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12010                         offs += sizeof (uint32_t);
12011                 }
12012
12013                 if ((soffs = offs + needed) > buf->dtb_size) {
12014                         dtrace_buffer_drop(buf);
12015                         return (-1);
12016                 }
12017
12018                 if (mstate == NULL)
12019                         return (offs);
12020
12021                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12022                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12023                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12024
12025                 return (offs);
12026         }
12027
12028         if (buf->dtb_flags & DTRACEBUF_FILL) {
12029                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12030                     (buf->dtb_flags & DTRACEBUF_FULL))
12031                         return (-1);
12032                 goto out;
12033         }
12034
12035         total = needed + (offs & (align - 1));
12036
12037         /*
12038          * For a ring buffer, life is quite a bit more complicated.  Before
12039          * we can store any padding, we need to adjust our wrapping offset.
12040          * (If we've never before wrapped or we're not about to, no adjustment
12041          * is required.)
12042          */
12043         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12044             offs + total > buf->dtb_size) {
12045                 woffs = buf->dtb_xamot_offset;
12046
12047                 if (offs + total > buf->dtb_size) {
12048                         /*
12049                          * We can't fit in the end of the buffer.  First, a
12050                          * sanity check that we can fit in the buffer at all.
12051                          */
12052                         if (total > buf->dtb_size) {
12053                                 dtrace_buffer_drop(buf);
12054                                 return (-1);
12055                         }
12056
12057                         /*
12058                          * We're going to be storing at the top of the buffer,
12059                          * so now we need to deal with the wrapped offset.  We
12060                          * only reset our wrapped offset to 0 if it is
12061                          * currently greater than the current offset.  If it
12062                          * is less than the current offset, it is because a
12063                          * previous allocation induced a wrap -- but the
12064                          * allocation didn't subsequently take the space due
12065                          * to an error or false predicate evaluation.  In this
12066                          * case, we'll just leave the wrapped offset alone: if
12067                          * the wrapped offset hasn't been advanced far enough
12068                          * for this allocation, it will be adjusted in the
12069                          * lower loop.
12070                          */
12071                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12072                                 if (woffs >= offs)
12073                                         woffs = 0;
12074                         } else {
12075                                 woffs = 0;
12076                         }
12077
12078                         /*
12079                          * Now we know that we're going to be storing to the
12080                          * top of the buffer and that there is room for us
12081                          * there.  We need to clear the buffer from the current
12082                          * offset to the end (there may be old gunk there).
12083                          */
12084                         while (offs < buf->dtb_size)
12085                                 tomax[offs++] = 0;
12086
12087                         /*
12088                          * We need to set our offset to zero.  And because we
12089                          * are wrapping, we need to set the bit indicating as
12090                          * much.  We can also adjust our needed space back
12091                          * down to the space required by the ECB -- we know
12092                          * that the top of the buffer is aligned.
12093                          */
12094                         offs = 0;
12095                         total = needed;
12096                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
12097                 } else {
12098                         /*
12099                          * There is room for us in the buffer, so we simply
12100                          * need to check the wrapped offset.
12101                          */
12102                         if (woffs < offs) {
12103                                 /*
12104                                  * The wrapped offset is less than the offset.
12105                                  * This can happen if we allocated buffer space
12106                                  * that induced a wrap, but then we didn't
12107                                  * subsequently take the space due to an error
12108                                  * or false predicate evaluation.  This is
12109                                  * okay; we know that _this_ allocation isn't
12110                                  * going to induce a wrap.  We still can't
12111                                  * reset the wrapped offset to be zero,
12112                                  * however: the space may have been trashed in
12113                                  * the previous failed probe attempt.  But at
12114                                  * least the wrapped offset doesn't need to
12115                                  * be adjusted at all...
12116                                  */
12117                                 goto out;
12118                         }
12119                 }
12120
12121                 while (offs + total > woffs) {
12122                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12123                         size_t size;
12124
12125                         if (epid == DTRACE_EPIDNONE) {
12126                                 size = sizeof (uint32_t);
12127                         } else {
12128                                 ASSERT3U(epid, <=, state->dts_necbs);
12129                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12130
12131                                 size = state->dts_ecbs[epid - 1]->dte_size;
12132                         }
12133
12134                         ASSERT(woffs + size <= buf->dtb_size);
12135                         ASSERT(size != 0);
12136
12137                         if (woffs + size == buf->dtb_size) {
12138                                 /*
12139                                  * We've reached the end of the buffer; we want
12140                                  * to set the wrapped offset to 0 and break
12141                                  * out.  However, if the offs is 0, then we're
12142                                  * in a strange edge-condition:  the amount of
12143                                  * space that we want to reserve plus the size
12144                                  * of the record that we're overwriting is
12145                                  * greater than the size of the buffer.  This
12146                                  * is problematic because if we reserve the
12147                                  * space but subsequently don't consume it (due
12148                                  * to a failed predicate or error) the wrapped
12149                                  * offset will be 0 -- yet the EPID at offset 0
12150                                  * will not be committed.  This situation is
12151                                  * relatively easy to deal with:  if we're in
12152                                  * this case, the buffer is indistinguishable
12153                                  * from one that hasn't wrapped; we need only
12154                                  * finish the job by clearing the wrapped bit,
12155                                  * explicitly setting the offset to be 0, and
12156                                  * zero'ing out the old data in the buffer.
12157                                  */
12158                                 if (offs == 0) {
12159                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12160                                         buf->dtb_offset = 0;
12161                                         woffs = total;
12162
12163                                         while (woffs < buf->dtb_size)
12164                                                 tomax[woffs++] = 0;
12165                                 }
12166
12167                                 woffs = 0;
12168                                 break;
12169                         }
12170
12171                         woffs += size;
12172                 }
12173
12174                 /*
12175                  * We have a wrapped offset.  It may be that the wrapped offset
12176                  * has become zero -- that's okay.
12177                  */
12178                 buf->dtb_xamot_offset = woffs;
12179         }
12180
12181 out:
12182         /*
12183          * Now we can plow the buffer with any necessary padding.
12184          */
12185         while (offs & (align - 1)) {
12186                 /*
12187                  * Assert that our alignment is off by a number which
12188                  * is itself sizeof (uint32_t) aligned.
12189                  */
12190                 ASSERT(!((align - (offs & (align - 1))) &
12191                     (sizeof (uint32_t) - 1)));
12192                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12193                 offs += sizeof (uint32_t);
12194         }
12195
12196         if (buf->dtb_flags & DTRACEBUF_FILL) {
12197                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12198                         buf->dtb_flags |= DTRACEBUF_FULL;
12199                         return (-1);
12200                 }
12201         }
12202
12203         if (mstate == NULL)
12204                 return (offs);
12205
12206         /*
12207          * For ring buffers and fill buffers, the scratch space is always
12208          * the inactive buffer.
12209          */
12210         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12211         mstate->dtms_scratch_size = buf->dtb_size;
12212         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12213
12214         return (offs);
12215 }
12216
12217 static void
12218 dtrace_buffer_polish(dtrace_buffer_t *buf)
12219 {
              /*
               * Zero the gap between the buffer offset (dtb_offset) and the
               * wrapped offset (dtb_xamot_offset) of a wrapped ring buffer.
               * The buffer must be a ring buffer and the caller must hold
               * dtrace_lock (both are asserted).  A buffer that has not wrapped
               * is returned untouched.
               */
12220         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12221         ASSERT(MUTEX_HELD(&dtrace_lock));
12222
12223         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12224                 return;
12225
12226         /*
12227          * We need to polish the ring buffer.  There are three cases:
12228          *
12229          * - The first (and presumably most common) is that there is no gap
12230          *   between the buffer offset and the wrapped offset.  In this case,
12231          *   there is nothing in the buffer that isn't valid data; we can
12232          *   mark the buffer as polished and return.
12233          *
12234          * - The second (less common than the first but still more common
12235          *   than the third) is that there is a gap between the buffer offset
12236          *   and the wrapped offset, and the wrapped offset is larger than the
12237          *   buffer offset.  This can happen because of an alignment issue, or
12238          *   can happen because of a call to dtrace_buffer_reserve() that
12239          *   didn't subsequently consume the buffer space.  In this case,
12240          *   we need to zero the data from the buffer offset to the wrapped
12241          *   offset.
12242          *
12243          * - The third (and least common) is that there is a gap between the
12244          *   buffer offset and the wrapped offset, but the wrapped offset is
12245          *   _less_ than the buffer offset.  This can only happen because a
12246          *   call to dtrace_buffer_reserve() induced a wrap, but the space
12247          *   was not subsequently consumed.  In this case, we need to zero the
12248          *   space from the offset to the end of the buffer _and_ from the
12249          *   top of the buffer to the wrapped offset.
               *
               * Note that this function does not set any "polished" flag:  in
               * the first case the two offsets are equal, neither test below is
               * true, and nothing is written.
12250          */
              /* Second case:  zero [dtb_offset, dtb_xamot_offset). */
12251         if (buf->dtb_offset < buf->dtb_xamot_offset) {
12252                 bzero(buf->dtb_tomax + buf->dtb_offset,
12253                     buf->dtb_xamot_offset - buf->dtb_offset);
12254         }
12255
              /*
               * Third case:  zero [dtb_offset, dtb_size) and then
               * [0, dtb_xamot_offset).
               */
12256         if (buf->dtb_offset > buf->dtb_xamot_offset) {
12257                 bzero(buf->dtb_tomax + buf->dtb_offset,
12258                     buf->dtb_size - buf->dtb_offset);
12259                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12260         }
12261 }
12262
12263 /*
12264  * This routine determines if data generated at the specified time has likely
12265  * been entirely consumed at user-level.  This routine is called to determine
12266  * if an ECB on a defunct probe (but for an active enabling) can be safely
12267  * disabled and destroyed.
       *
       * 'bufs' is walked as an array of NCPU buffers; 'when' is the time being
       * asked about.  Returns 1 only if every buffer of non-zero size passes all
       * of the checks in the loop, and 0 as soon as any buffer fails one.
12268  */
12269 static int
12270 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12271 {
12272         int i;
12273
12274         for (i = 0; i < NCPU; i++) {
12275                 dtrace_buffer_t *buf = &bufs[i];
12276
                      /* Buffers of zero size are not considered at all. */
12277                 if (buf->dtb_size == 0)
12278                         continue;
12279
                      /* A ring buffer is never reported as consumed. */
12280                 if (buf->dtb_flags & DTRACEBUF_RING)
12281                         return (0);
12282
                      /* dtb_switched is zero, yet the buffer holds data. */
12283                 if (!buf->dtb_switched && buf->dtb_offset != 0)
12284                         return (0);
12285
                      /*
                       * NOTE(review): dtb_switched - dtb_interval is presumably
                       * the time of the switch before the most recent one, so
                       * this rejects a buffer that has not been switched twice
                       * since 'when' -- confirm against dtrace_buffer_t.
                       */
12286                 if (buf->dtb_switched - buf->dtb_interval < when)
12287                         return (0);
12288         }
12289
12290         return (1);
12291 }
12292
12293 static void
12294 dtrace_buffer_free(dtrace_buffer_t *bufs)
12295 {
              /*
               * Free the storage behind each of the NCPU buffers in 'bufs' and
               * reset dtb_size, dtb_tomax and dtb_xamot so that each entry
               * reads as unallocated afterwards.  The dtrace_buffer_t array
               * itself is not freed here.
               */
12296         int i;
12297
12298         for (i = 0; i < NCPU; i++) {
12299                 dtrace_buffer_t *buf = &bufs[i];
12300
                      /*
                       * No dtb_tomax means nothing was allocated for this
                       * entry; the other two fields are asserted to agree.
                       */
12301                 if (buf->dtb_tomax == NULL) {
12302                         ASSERT(buf->dtb_xamot == NULL);
12303                         ASSERT(buf->dtb_size == 0);
12304                         continue;
12305                 }
12306
                      /*
                       * dtb_xamot is optional (it is asserted to be absent for
                       * DTRACEBUF_NOSWITCH buffers); when present it is freed
                       * with the same dtb_size as dtb_tomax.
                       */
12307                 if (buf->dtb_xamot != NULL) {
12308                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12309                         kmem_free(buf->dtb_xamot, buf->dtb_size);
12310                 }
12311
12312                 kmem_free(buf->dtb_tomax, buf->dtb_size);
12313                 buf->dtb_size = 0;
12314                 buf->dtb_tomax = NULL;
12315                 buf->dtb_xamot = NULL;
12316         }
12317 }
12318
12319 /*
12320  * DTrace Enabling Functions
12321  */
12322 static dtrace_enabling_t *
12323 dtrace_enabling_create(dtrace_vstate_t *vstate)
12324 {
              /*
               * Allocate a zero-filled enabling with KM_SLEEP and record
               * 'vstate' in dten_vstate.  The returned enabling has no ECB
               * descriptions and is not on the retained list; the allocation
               * result is not checked for NULL here.
               */
12325         dtrace_enabling_t *enab;
12326
12327         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12328         enab->dten_vstate = vstate;
12329
12330         return (enab);
12331 }
12332
12333 static void
12334 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12335 {
              /*
               * Append the ECB description 'ecb' to enab->dten_desc.  When the
               * array is full it is replaced by one of twice the capacity (or
               * of capacity one if there was none); the old contents are
               * copied over and the old array is freed.  The pointer 'ecb' is
               * stored as given, not copied.
               */
12336         dtrace_ecbdesc_t **ndesc;
12337         size_t osize, nsize;
12338
12339         /*
12340          * We can't add to enablings after we've enabled them, or after we've
12341          * retained them.
12342          */
12343         ASSERT(enab->dten_probegen == 0);
12344         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12345
              /* Fast path:  there is still a free slot. */
12346         if (enab->dten_ndesc < enab->dten_maxdesc) {
12347                 enab->dten_desc[enab->dten_ndesc++] = ecb;
12348                 return;
12349         }
12350
              /*
               * The array elements are pointers to dtrace_ecbdesc_t, so both
               * the old and the new byte counts are computed from that element
               * type.  osize must be taken before dten_maxdesc is changed.
               */
12351         osize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12352
12353         if (enab->dten_maxdesc == 0) {
12354                 enab->dten_maxdesc = 1;
12355         } else {
12356                 enab->dten_maxdesc <<= 1;
12357         }
12358
12359         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12360
12361         nsize = enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *);
12362         ndesc = kmem_zalloc(nsize, KM_SLEEP);
              /*
               * On the first growth osize is zero, so this copies nothing; the
               * old array is only freed if there was one.
               */
12363         bcopy(enab->dten_desc, ndesc, osize);
12364         if (enab->dten_desc != NULL)
12365                 kmem_free(enab->dten_desc, osize);
12366
12367         enab->dten_desc = ndesc;
12368         enab->dten_desc[enab->dten_ndesc++] = ecb;
12369 }
12370
12371 static void
12372 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12373     dtrace_probedesc_t *pd)
12374 {
              /*
               * Add to 'enab' a newly allocated ECB description that has the
               * predicate, action list and user argument of 'ecb' but the
               * probe description '*pd' (copied by value).  The predicate and
               * the actions are shared with 'ecb', not duplicated:  a hold is
               * taken on the predicate (if any) and on every action in the
               * list before the new description is added.
               */
12375         dtrace_ecbdesc_t *new;
12376         dtrace_predicate_t *pred;
12377         dtrace_actdesc_t *act;
12378
12379         /*
12380          * We're going to create a new ECB description that matches the
12381          * specified ECB in every way, but has the specified probe description.
12382          */
12383         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12384
12385         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12386                 dtrace_predicate_hold(pred);
12387
12388         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12389                 dtrace_actdesc_hold(act);
12390
12391         new->dted_action = ecb->dted_action;
12392         new->dted_pred = ecb->dted_pred;
12393         new->dted_probe = *pd;
12394         new->dted_uarg = ecb->dted_uarg;
12395
12396         dtrace_enabling_add(enab, new);
12397 }
12398
12399 static void
12400 dtrace_enabling_dump(dtrace_enabling_t *enab)
12401 {
              /*
               * Emit one CE_NOTE message per ECB description in 'enab', giving
               * its index and the provider, module, function and name strings
               * of its probe description.  Nothing is modified.
               */
12402         int i;
12403
12404         for (i = 0; i < enab->dten_ndesc; i++) {
12405                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12406
12407                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12408                     desc->dtpd_provider, desc->dtpd_mod,
12409                     desc->dtpd_func, desc->dtpd_name);
12410         }
12411 }
12412
12413 static void
12414 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12415 {
              /*
               * Tear down 'enab':  for every ECB description release its
               * predicate and each of its actions and free the description,
               * then free the description array, unlink the enabling from the
               * dtrace_retained list if it is on it, and free the enabling
               * itself.  The caller must hold dtrace_lock (asserted).
               */
12416         int i;
12417         dtrace_ecbdesc_t *ep;
12418         dtrace_vstate_t *vstate = enab->dten_vstate;
12419
12420         ASSERT(MUTEX_HELD(&dtrace_lock));
12421
12422         for (i = 0; i < enab->dten_ndesc; i++) {
12423                 dtrace_actdesc_t *act, *next;
12424                 dtrace_predicate_t *pred;
12425
12426                 ep = enab->dten_desc[i];
12427
12428                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12429                         dtrace_predicate_release(pred, vstate);
12430
                      /*
                       * The successor is read before the release, presumably
                       * because the release may free 'act'.
                       */
12431                 for (act = ep->dted_action; act != NULL; act = next) {
12432                         next = act->dtad_next;
12433                         dtrace_actdesc_release(act, vstate);
12434                 }
12435
12436                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12437         }
12438
              /*
               * The array holds dten_maxdesc pointers to dtrace_ecbdesc_t; the
               * size handed to kmem_free() is computed from that element type.
               */
12439         if (enab->dten_desc != NULL)
12440                 kmem_free(enab->dten_desc,
12441                     enab->dten_maxdesc * sizeof (dtrace_ecbdesc_t *));
12442
12443         /*
12444          * If this was a retained enabling, decrement the dts_nretained count
12445          * and take it off of the dtrace_retained list.
               *
               * The enabling is treated as retained if it has a neighbour on
               * the list or is the list head.
12446          */
12447         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12448             dtrace_retained == enab) {
12449                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12450                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12451                 enab->dten_vstate->dtvs_state->dts_nretained--;
12452                 dtrace_retained_gen++;
12453         }
12454
              /*
               * Fix up the forward link:  either the list head moves to our
               * successor, or our predecessor is pointed past us.  With no
               * predecessor and not being the head, there is nothing to do.
               */
12455         if (enab->dten_prev == NULL) {
12456                 if (dtrace_retained == enab) {
12457                         dtrace_retained = enab->dten_next;
12458
12459                         if (dtrace_retained != NULL)
12460                                 dtrace_retained->dten_prev = NULL;
12461                 }
12462         } else {
12463                 ASSERT(enab != dtrace_retained);
12464                 ASSERT(dtrace_retained != NULL);
12465                 enab->dten_prev->dten_next = enab->dten_next;
12466         }
12467
              /* Fix up the backward link of our successor, if any. */
12468         if (enab->dten_next != NULL) {
12469                 ASSERT(dtrace_retained != NULL);
12470                 enab->dten_next->dten_prev = enab->dten_prev;
12471         }
12472
12473         kmem_free(enab, sizeof (dtrace_enabling_t));
12474 }
12475
12476 static int
12477 dtrace_enabling_retain(dtrace_enabling_t *enab)
12478 {
              /*
               * Put 'enab' at the head of the dtrace_retained list on behalf
               * of the state that owns its vstate.  Returns 0 on success, or
               * ENOSPC (leaving the list and all counters untouched) if that
               * state already has dtrace_retain_max or more retained
               * enablings.  The caller must hold dtrace_lock and 'enab' must
               * not already be linked (both asserted).
               */
12479         dtrace_state_t *state;
12480
12481         ASSERT(MUTEX_HELD(&dtrace_lock));
12482         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12483         ASSERT(enab->dten_vstate != NULL);
12484
12485         state = enab->dten_vstate->dtvs_state;
12486         ASSERT(state != NULL);
12487
12488         /*
12489          * We only allow each state to retain dtrace_retain_max enablings.
12490          */
12491         if (state->dts_nretained >= dtrace_retain_max)
12492                 return (ENOSPC);
12493
              /*
               * Count the enabling against its state and bump the list
               * generation, which dtrace_enabling_provide() compares to detect
               * changes to the retained list.
               */
12494         state->dts_nretained++;
12495         dtrace_retained_gen++;
12496
              /* Empty list:  'enab' becomes the only element. */
12497         if (dtrace_retained == NULL) {
12498                 dtrace_retained = enab;
12499                 return (0);
12500         }
12501
              /* Otherwise link it in front of the current head. */
12502         enab->dten_next = dtrace_retained;
12503         dtrace_retained->dten_prev = enab;
12504         dtrace_retained = enab;
12505
12506         return (0);
12507 }
12508
12509 static int
12510 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12511     dtrace_probedesc_t *create)
12512 {
              /*
               * Build and retain a new enabling for 'state' that contains, for
               * every ECB description in state's retained enablings whose probe
               * description equals 'match' exactly, a like description whose
               * probe description is 'create'.
               *
               * Returns 0 on success, ENOENT if no description matched, or the
               * error from dtrace_enabling_retain(); in both failure cases the
               * new enabling is destroyed.  The caller must hold dtrace_lock
               * (asserted).
               */
12513         dtrace_enabling_t *new, *enab;
12514         int found = 0, err = ENOENT;
12515
12516         ASSERT(MUTEX_HELD(&dtrace_lock));
12517         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12518         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12519         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12520         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12521
              /*
               * 'new' is not put on the retained list until after the scan
               * below, so the scan never visits it.
               */
12522         new = dtrace_enabling_create(&state->dts_vstate);
12523
12524         /*
12525          * Iterate over all retained enablings, looking for enablings that
12526          * match the specified state.
12527          */
12528         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12529                 int i;
12530
12531                 /*
12532                  * dtvs_state can only be NULL for helper enablings -- and
12533                  * helper enablings can't be retained.
12534                  */
12535                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12536
12537                 if (enab->dten_vstate->dtvs_state != state)
12538                         continue;
12539
12540                 /*
12541                  * Now iterate over each probe description; we're looking for
12542                  * an exact match to the specified probe description.
                       * All four strings are compared with strcmp().
12543                  */
12544                 for (i = 0; i < enab->dten_ndesc; i++) {
12545                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12546                         dtrace_probedesc_t *pd = &ep->dted_probe;
12547
12548                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12549                                 continue;
12550
12551                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12552                                 continue;
12553
12554                         if (strcmp(pd->dtpd_func, match->dtpd_func))
12555                                 continue;
12556
12557                         if (strcmp(pd->dtpd_name, match->dtpd_name))
12558                                 continue;
12559
12560                         /*
12561                          * We have a winning probe!  Add it to our growing
12562                          * enabling.
12563                          */
12564                         found = 1;
12565                         dtrace_enabling_addlike(new, ep, create);
12566                 }
12567         }
12568
              /*
               * 'err' still holds ENOENT when nothing was found; otherwise it
               * is overwritten with the result of the retain attempt.
               */
12569         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12570                 dtrace_enabling_destroy(new);
12571                 return (err);
12572         }
12573
12574         return (0);
12575 }
12576
12577 static void
12578 dtrace_enabling_retract(dtrace_state_t *state)
12579 {
              /*
               * Destroy every retained enabling that belongs to 'state'.  On
               * return state->dts_nretained is asserted to be zero.  The
               * caller must hold dtrace_lock (asserted).
               */
12580         dtrace_enabling_t *enab, *next;
12581
12582         ASSERT(MUTEX_HELD(&dtrace_lock));
12583
12584         /*
12585          * Iterate over all retained enablings, destroy the enablings retained
12586          * for the specified state.
12587          */
12588         for (enab = dtrace_retained; enab != NULL; enab = next) {
                      /*
                       * Read the successor first:  dtrace_enabling_destroy()
                       * frees 'enab'.
                       */
12589                 next = enab->dten_next;
12590
12591                 /*
12592                  * dtvs_state can only be NULL for helper enablings -- and
12593                  * helper enablings can't be retained.
12594                  */
12595                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12596
12597                 if (enab->dten_vstate->dtvs_state == state) {
12598                         ASSERT(state->dts_nretained > 0);
12599                         dtrace_enabling_destroy(enab);
12600                 }
12601         }
12602
12603         ASSERT(state->dts_nretained == 0);
12604 }
12605
12606 static int
12607 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12608 {
              /*
               * Call dtrace_probe_enable() for each ECB description of 'enab',
               * in order, summing the values it returns.
               *
               * On success returns 0, records dtrace_probegen in
               * enab->dten_probegen and, if 'nmatched' is not NULL, stores the
               * sum through it.  If a call leaves enab->dten_error non-zero,
               * that error is returned at once:  the remaining descriptions
               * are not tried and neither dten_probegen nor *nmatched is
               * written.  The caller must hold cpu_lock and dtrace_lock (both
               * asserted).
               */
12609         int i = 0;
12610         int matched = 0;
12611
12612         ASSERT(MUTEX_HELD(&cpu_lock));
12613         ASSERT(MUTEX_HELD(&dtrace_lock));
12614
12615         for (i = 0; i < enab->dten_ndesc; i++) {
12616                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12617
                      /*
                       * dten_current names the description being enabled and
                       * dten_error is cleared so that any error seen below
                       * comes from this call.
                       */
12618                 enab->dten_current = ep;
12619                 enab->dten_error = 0;
12620
12621                 matched += dtrace_probe_enable(&ep->dted_probe, enab);
12622
12623                 if (enab->dten_error != 0) {
12624                         /*
12625                          * If we get an error half-way through enabling the
12626                          * probes, we kick out -- perhaps with some number of
12627                          * them enabled.  Leaving enabled probes enabled may
12628                          * be slightly confusing for user-level, but we expect
12629                          * that no one will attempt to actually drive on in
12630                          * the face of such errors.  If this is an anonymous
12631                          * enabling (indicated with a NULL nmatched pointer),
12632                          * we cmn_err() a message.  We aren't expecting to
12633                          * get such an error -- such as it can exist at all,
12634                          * it would be a result of corrupted DOF in the driver
12635                          * properties.
12636                          */
12637                         if (nmatched == NULL) {
12638                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12639                                     "error on %p: %d", (void *)ep,
12640                                     enab->dten_error);
12641                         }
12642
12643                         return (enab->dten_error);
12644                 }
12645         }
12646
12647         enab->dten_probegen = dtrace_probegen;
12648         if (nmatched != NULL)
12649                 *nmatched = matched;
12650
12651         return (0);
12652 }
12653
12654 static void
12655 dtrace_enabling_matchall(void)
12656 {
              /*
               * Run dtrace_enabling_match() over the retained enablings.
               * cpu_lock and then dtrace_lock are taken here and dropped in
               * the reverse order before returning.  Each match is made with
               * a NULL nmatched pointer and its return value is discarded.
               */
12657         dtrace_enabling_t *enab;
12658
12659         mutex_enter(&cpu_lock);
12660         mutex_enter(&dtrace_lock);
12661
12662         /*
12663          * Iterate over all retained enablings to see if any probes match
12664          * against them.  We only perform this operation on enablings for which
12665          * we have sufficient permissions by virtue of being in the global zone
12666          * or in the same zone as the DTrace client.  Because we can be called
12667          * after dtrace_detach() has been called, we cannot assert that there
12668          * are retained enablings.  We can safely load from dtrace_retained,
12669          * however:  the taskq_destroy() at the end of dtrace_detach() will
12670          * block pending our completion.
               *
               * The zone check is only compiled when "sun" is defined;
               * otherwise every retained enabling is matched.
12671          */
12672         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12673 #if defined(sun)
12674                 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12675
12676                 if (INGLOBALZONE(curproc) ||
12677                     cr != NULL && getzoneid() == crgetzoneid(cr))
12678 #endif
12679                         (void) dtrace_enabling_match(enab, NULL);
12680         }
12681
12682         mutex_exit(&dtrace_lock);
12683         mutex_exit(&cpu_lock);
12684 }
12685
12686 /*
12687  * If an enabling is to be enabled without having matched probes (that is, if
12688  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12689  * enabling must be _primed_ by creating an ECB for every ECB description.
12690  * This must be done to assure that we know the number of speculations, the
12691  * number of aggregations, the minimum buffer size needed, etc. before we
12692  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12693  * enabling any probes, we create ECBs for every ECB description, but with a
12694  * NULL probe -- which is exactly what this function does.
       *
       * Only retained enablings that belong to 'state' and have not already
       * been primed are touched; each one is marked with dten_primed afterwards.
12695  */
12696 static void
12697 dtrace_enabling_prime(dtrace_state_t *state)
12698 {
12699         dtrace_enabling_t *enab;
12700         int i;
12701
12702         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12703                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12704
12705                 if (enab->dten_vstate->dtvs_state != state)
12706                         continue;
12707
12708                 /*
12709                  * We don't want to prime an enabling more than once, lest
12710                  * we allow a malicious user to induce resource exhaustion.
12711                  * (The ECBs that result from priming an enabling aren't
12712                  * leaked -- but they also aren't deallocated until the
12713                  * consumer state is destroyed.)
12714                  */
12715                 if (enab->dten_primed)
12716                         continue;
12717
                      /*
                       * A NULL probe description is passed for each ECB
                       * description in turn; the return value is ignored.
                       */
12718                 for (i = 0; i < enab->dten_ndesc; i++) {
12719                         enab->dten_current = enab->dten_desc[i];
12720                         (void) dtrace_probe_enable(NULL, enab);
12721                 }
12722
12723                 enab->dten_primed = 1;
12724         }
12725 }
12726
12727 /*
12728  * Called to indicate that probes should be provided due to retained
12729  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12730  * must take an initial lap through the enabling calling the dtps_provide()
12731  * entry point explicitly to allow for autocreated probes.
       *
       * If 'prv' is NULL, every provider on the dtrace_provider list is visited;
       * otherwise only 'prv' is.  dtrace_lock and dtrace_provider_lock must be
       * held on entry (asserted).  dtrace_lock is dropped and retaken around
       * each dtps_provide() call and around the final dtrace_probe_provide()
       * call, so it is held again on return.
12732  */
12733 static void
12734 dtrace_enabling_provide(dtrace_provider_t *prv)
12735 {
12736         int i, all = 0;
12737         dtrace_probedesc_t desc;
12738         dtrace_genid_t gen;
12739
12740         ASSERT(MUTEX_HELD(&dtrace_lock));
12741         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12742
12743         if (prv == NULL) {
12744                 all = 1;
12745                 prv = dtrace_provider;
12746         }
12747
12748         do {
12749                 dtrace_enabling_t *enab;
12750                 void *parg = prv->dtpv_arg;
12751
                      /*
                       * A retry restarts the walk of the retained list from its
                       * head for the current provider, with a fresh snapshot of
                       * the list generation.
                       */
12752 retry:
12753                 gen = dtrace_retained_gen;
12754                 for (enab = dtrace_retained; enab != NULL;
12755                     enab = enab->dten_next) {
12756                         for (i = 0; i < enab->dten_ndesc; i++) {
                                      /*
                                       * The probe description is copied into
                                       * a local before the lock is dropped, so
                                       * the callback is handed the copy rather
                                       * than a pointer into the retained list.
                                       */
12757                                 desc = enab->dten_desc[i]->dted_probe;
12758                                 mutex_exit(&dtrace_lock);
12759                                 prv->dtpv_pops.dtps_provide(parg, &desc);
12760                                 mutex_enter(&dtrace_lock);
12761                                 /*
12762                                  * Process the retained enablings again if
12763                                  * they have changed while we weren't holding
12764                                  * dtrace_lock.
12765                                  */
12766                                 if (gen != dtrace_retained_gen)
12767                                         goto retry;
12768                         }
12769                 }
12770         } while (all && (prv = prv->dtpv_next) != NULL);
12771
              /*
               * In the "all" case the loop above has left prv NULL; NULL is
               * passed explicitly for that case.
               */
12772         mutex_exit(&dtrace_lock);
12773         dtrace_probe_provide(NULL, all ? NULL : prv);
12774         mutex_enter(&dtrace_lock);
12775 }
12776
12777 /*
12778  * Called to reap ECBs that are attached to probes from defunct providers.
       *
       * cpu_lock and then dtrace_lock are taken for the whole scan of the
       * dtrace_probes array and dropped in the reverse order before returning.
12779  */
12780 static void
12781 dtrace_enabling_reap(void)
12782 {
12783         dtrace_provider_t *prov;
12784         dtrace_probe_t *probe;
12785         dtrace_ecb_t *ecb;
12786         hrtime_t when;
12787         int i;
12788
12789         mutex_enter(&cpu_lock);
12790         mutex_enter(&dtrace_lock);
12791
12792         for (i = 0; i < dtrace_nprobes; i++) {
                      /* Skip empty slots and probes with no ECBs. */
12793                 if ((probe = dtrace_probes[i]) == NULL)
12794                         continue;
12795
12796                 if (probe->dtpr_ecb == NULL)
12797                         continue;
12798
12799                 prov = probe->dtpr_provider;
12800
                      /*
                       * A zero dtpv_defunct means the provider is not defunct;
                       * otherwise its value is used as the 'when' argument to
                       * dtrace_buffer_consumed() below.
                       */
12801                 if ((when = prov->dtpv_defunct) == 0)
12802                         continue;
12803
12804                 /*
12805                  * We have ECBs on a defunct provider:  we want to reap these
12806                  * ECBs to allow the provider to unregister.  The destruction
12807                  * of these ECBs must be done carefully:  if we destroy the ECB
12808                  * and the consumer later wishes to consume an EPID that
12809                  * corresponds to the destroyed ECB (and if the EPID metadata
12810                  * has not been previously consumed), the consumer will abort
12811                  * processing on the unknown EPID.  To reduce (but not, sadly,
12812                  * eliminate) the possibility of this, we will only destroy an
12813                  * ECB for a defunct provider if, for the state that
12814                  * corresponds to the ECB:
12815                  *
12816                  *  (a) There is no speculative tracing (which can effectively
12817                  *      cache an EPID for an arbitrary amount of time).
12818                  *
12819                  *  (b) The principal buffers have been switched twice since the
12820                  *      provider became defunct.
12821                  *
12822                  *  (c) The aggregation buffers are of zero size or have been
12823                  *      switched twice since the provider became defunct.
12824                  *
12825                  * We use dts_speculates to determine (a) and call a function
12826                  * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
12827                  * that as soon as we've been unable to destroy one of the ECBs
12828                  * associated with the probe, we quit trying -- reaping is only
12829                  * fruitful in as much as we can destroy all ECBs associated
12830                  * with the defunct provider's probes.
12831                  */
                      /*
                       * Each pass looks at the ECB currently at probe->dtpr_ecb;
                       * disabling it is asserted to take it off that position,
                       * so the loop either advances or breaks.
                       */
12832                 while ((ecb = probe->dtpr_ecb) != NULL) {
12833                         dtrace_state_t *state = ecb->dte_state;
12834                         dtrace_buffer_t *buf = state->dts_buffer;
12835                         dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12836
12837                         if (state->dts_speculates)
12838                                 break;
12839
12840                         if (!dtrace_buffer_consumed(buf, when))
12841                                 break;
12842
12843                         if (!dtrace_buffer_consumed(aggbuf, when))
12844                                 break;
12845
12846                         dtrace_ecb_disable(ecb);
12847                         ASSERT(probe->dtpr_ecb != ecb);
12848                         dtrace_ecb_destroy(ecb);
12849                 }
12850         }
12851
12852         mutex_exit(&dtrace_lock);
12853         mutex_exit(&cpu_lock);
12854 }
12855
12856 /*
12857  * DTrace DOF Functions
12858  */
12859 /*ARGSUSED*/
12860 static void
12861 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12862 {
12863         if (dtrace_err_verbose)
12864                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12865
12866 #ifdef DTRACE_ERRDEBUG
12867         dtrace_errdebug(str);
12868 #endif
12869 }
12870
12871 /*
12872  * Create DOF out of a currently enabled state.  Right now, we only create
12873  * DOF containing the run-time options -- but this could be expanded to create
12874  * complete DOF representing the enabled state.
12875  */
12876 static dof_hdr_t *
12877 dtrace_dof_create(dtrace_state_t *state)
12878 {
12879         dof_hdr_t *dof;
12880         dof_sec_t *sec;
12881         dof_optdesc_t *opt;
12882         int i, len = sizeof (dof_hdr_t) +
12883             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12884             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12885
12886         ASSERT(MUTEX_HELD(&dtrace_lock));
12887
12888         dof = kmem_zalloc(len, KM_SLEEP);
12889         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12890         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12891         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12892         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12893
12894         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12895         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12896         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12897         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12898         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12899         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12900
12901         dof->dofh_flags = 0;
12902         dof->dofh_hdrsize = sizeof (dof_hdr_t);
12903         dof->dofh_secsize = sizeof (dof_sec_t);
12904         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
12905         dof->dofh_secoff = sizeof (dof_hdr_t);
12906         dof->dofh_loadsz = len;
12907         dof->dofh_filesz = len;
12908         dof->dofh_pad = 0;
12909
12910         /*
12911          * Fill in the option section header...
12912          */
12913         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12914         sec->dofs_type = DOF_SECT_OPTDESC;
12915         sec->dofs_align = sizeof (uint64_t);
12916         sec->dofs_flags = DOF_SECF_LOAD;
12917         sec->dofs_entsize = sizeof (dof_optdesc_t);
12918
12919         opt = (dof_optdesc_t *)((uintptr_t)sec +
12920             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12921
12922         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12923         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12924
12925         for (i = 0; i < DTRACEOPT_MAX; i++) {
12926                 opt[i].dofo_option = i;
12927                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12928                 opt[i].dofo_value = state->dts_options[i];
12929         }
12930
12931         return (dof);
12932 }
12933
12934 static dof_hdr_t *
12935 dtrace_dof_copyin(uintptr_t uarg, int *errp)
12936 {
12937         dof_hdr_t hdr, *dof;
12938
12939         ASSERT(!MUTEX_HELD(&dtrace_lock));
12940
12941         /*
12942          * First, we're going to copyin() the sizeof (dof_hdr_t).
12943          */
12944         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12945                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12946                 *errp = EFAULT;
12947                 return (NULL);
12948         }
12949
12950         /*
12951          * Now we'll allocate the entire DOF and copy it in -- provided
12952          * that the length isn't outrageous.
12953          */
12954         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12955                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12956                 *errp = E2BIG;
12957                 return (NULL);
12958         }
12959
12960         if (hdr.dofh_loadsz < sizeof (hdr)) {
12961                 dtrace_dof_error(&hdr, "invalid load size");
12962                 *errp = EINVAL;
12963                 return (NULL);
12964         }
12965
12966         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12967
12968         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12969             dof->dofh_loadsz != hdr.dofh_loadsz) {
12970                 kmem_free(dof, hdr.dofh_loadsz);
12971                 *errp = EFAULT;
12972                 return (NULL);
12973         }
12974
12975         return (dof);
12976 }
12977
12978 #if !defined(sun)
12979 static __inline uchar_t
12980 dtrace_dof_char(char c) {
12981         switch (c) {
12982         case '0':
12983         case '1':
12984         case '2':
12985         case '3':
12986         case '4':
12987         case '5':
12988         case '6':
12989         case '7':
12990         case '8':
12991         case '9':
12992                 return (c - '0');
12993         case 'A':
12994         case 'B':
12995         case 'C':
12996         case 'D':
12997         case 'E':
12998         case 'F':
12999                 return (c - 'A' + 10);
13000         case 'a':
13001         case 'b':
13002         case 'c':
13003         case 'd':
13004         case 'e':
13005         case 'f':
13006                 return (c - 'a' + 10);
13007         }
13008         /* Should not reach here. */
13009         return (0);
13010 }
13011 #endif
13012
13013 static dof_hdr_t *
13014 dtrace_dof_property(const char *name)
13015 {
13016         uchar_t *buf;
13017         uint64_t loadsz;
13018         unsigned int len, i;
13019         dof_hdr_t *dof;
13020
13021 #if defined(sun)
13022         /*
13023          * Unfortunately, array of values in .conf files are always (and
13024          * only) interpreted to be integer arrays.  We must read our DOF
13025          * as an integer array, and then squeeze it into a byte array.
13026          */
13027         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13028             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13029                 return (NULL);
13030
13031         for (i = 0; i < len; i++)
13032                 buf[i] = (uchar_t)(((int *)buf)[i]);
13033
13034         if (len < sizeof (dof_hdr_t)) {
13035                 ddi_prop_free(buf);
13036                 dtrace_dof_error(NULL, "truncated header");
13037                 return (NULL);
13038         }
13039
13040         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13041                 ddi_prop_free(buf);
13042                 dtrace_dof_error(NULL, "truncated DOF");
13043                 return (NULL);
13044         }
13045
13046         if (loadsz >= dtrace_dof_maxsize) {
13047                 ddi_prop_free(buf);
13048                 dtrace_dof_error(NULL, "oversized DOF");
13049                 return (NULL);
13050         }
13051
13052         dof = kmem_alloc(loadsz, KM_SLEEP);
13053         bcopy(buf, dof, loadsz);
13054         ddi_prop_free(buf);
13055 #else
13056         char *p;
13057         char *p_env;
13058
13059         if ((p_env = getenv(name)) == NULL)
13060                 return (NULL);
13061
13062         len = strlen(p_env) / 2;
13063
13064         buf = kmem_alloc(len, KM_SLEEP);
13065
13066         dof = (dof_hdr_t *) buf;
13067
13068         p = p_env;
13069
13070         for (i = 0; i < len; i++) {
13071                 buf[i] = (dtrace_dof_char(p[0]) << 4) |
13072                      dtrace_dof_char(p[1]);
13073                 p += 2;
13074         }
13075
13076         freeenv(p_env);
13077
13078         if (len < sizeof (dof_hdr_t)) {
13079                 kmem_free(buf, 0);
13080                 dtrace_dof_error(NULL, "truncated header");
13081                 return (NULL);
13082         }
13083
13084         if (len < (loadsz = dof->dofh_loadsz)) {
13085                 kmem_free(buf, 0);
13086                 dtrace_dof_error(NULL, "truncated DOF");
13087                 return (NULL);
13088         }
13089
13090         if (loadsz >= dtrace_dof_maxsize) {
13091                 kmem_free(buf, 0);
13092                 dtrace_dof_error(NULL, "oversized DOF");
13093                 return (NULL);
13094         }
13095 #endif
13096
13097         return (dof);
13098 }
13099
13100 static void
13101 dtrace_dof_destroy(dof_hdr_t *dof)
13102 {
13103         kmem_free(dof, dof->dofh_loadsz);
13104 }
13105
13106 /*
13107  * Return the dof_sec_t pointer corresponding to a given section index.  If the
13108  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13109  * a type other than DOF_SECT_NONE is specified, the header is checked against
13110  * this type and NULL is returned if the types do not match.
13111  */
13112 static dof_sec_t *
13113 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13114 {
13115         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13116             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13117
13118         if (i >= dof->dofh_secnum) {
13119                 dtrace_dof_error(dof, "referenced section index is invalid");
13120                 return (NULL);
13121         }
13122
13123         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13124                 dtrace_dof_error(dof, "referenced section is not loadable");
13125                 return (NULL);
13126         }
13127
13128         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13129                 dtrace_dof_error(dof, "referenced section is the wrong type");
13130                 return (NULL);
13131         }
13132
13133         return (sec);
13134 }
13135
13136 static dtrace_probedesc_t *
13137 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13138 {
13139         dof_probedesc_t *probe;
13140         dof_sec_t *strtab;
13141         uintptr_t daddr = (uintptr_t)dof;
13142         uintptr_t str;
13143         size_t size;
13144
13145         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13146                 dtrace_dof_error(dof, "invalid probe section");
13147                 return (NULL);
13148         }
13149
13150         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13151                 dtrace_dof_error(dof, "bad alignment in probe description");
13152                 return (NULL);
13153         }
13154
13155         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13156                 dtrace_dof_error(dof, "truncated probe description");
13157                 return (NULL);
13158         }
13159
13160         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13161         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13162
13163         if (strtab == NULL)
13164                 return (NULL);
13165
13166         str = daddr + strtab->dofs_offset;
13167         size = strtab->dofs_size;
13168
13169         if (probe->dofp_provider >= strtab->dofs_size) {
13170                 dtrace_dof_error(dof, "corrupt probe provider");
13171                 return (NULL);
13172         }
13173
13174         (void) strncpy(desc->dtpd_provider,
13175             (char *)(str + probe->dofp_provider),
13176             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13177
13178         if (probe->dofp_mod >= strtab->dofs_size) {
13179                 dtrace_dof_error(dof, "corrupt probe module");
13180                 return (NULL);
13181         }
13182
13183         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13184             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13185
13186         if (probe->dofp_func >= strtab->dofs_size) {
13187                 dtrace_dof_error(dof, "corrupt probe function");
13188                 return (NULL);
13189         }
13190
13191         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13192             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13193
13194         if (probe->dofp_name >= strtab->dofs_size) {
13195                 dtrace_dof_error(dof, "corrupt probe name");
13196                 return (NULL);
13197         }
13198
13199         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13200             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13201
13202         return (desc);
13203 }
13204
13205 static dtrace_difo_t *
13206 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13207     cred_t *cr)
13208 {
13209         dtrace_difo_t *dp;
13210         size_t ttl = 0;
13211         dof_difohdr_t *dofd;
13212         uintptr_t daddr = (uintptr_t)dof;
13213         size_t max = dtrace_difo_maxsize;
13214         int i, l, n;
13215
13216         static const struct {
13217                 int section;
13218                 int bufoffs;
13219                 int lenoffs;
13220                 int entsize;
13221                 int align;
13222                 const char *msg;
13223         } difo[] = {
13224                 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13225                 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13226                 sizeof (dif_instr_t), "multiple DIF sections" },
13227
13228                 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13229                 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13230                 sizeof (uint64_t), "multiple integer tables" },
13231
13232                 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13233                 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13234                 sizeof (char), "multiple string tables" },
13235
13236                 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13237                 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13238                 sizeof (uint_t), "multiple variable tables" },
13239
13240                 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13241         };
13242
13243         if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13244                 dtrace_dof_error(dof, "invalid DIFO header section");
13245                 return (NULL);
13246         }
13247
13248         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13249                 dtrace_dof_error(dof, "bad alignment in DIFO header");
13250                 return (NULL);
13251         }
13252
13253         if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13254             sec->dofs_size % sizeof (dof_secidx_t)) {
13255                 dtrace_dof_error(dof, "bad size in DIFO header");
13256                 return (NULL);
13257         }
13258
13259         dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13260         n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13261
13262         dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13263         dp->dtdo_rtype = dofd->dofd_rtype;
13264
13265         for (l = 0; l < n; l++) {
13266                 dof_sec_t *subsec;
13267                 void **bufp;
13268                 uint32_t *lenp;
13269
13270                 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13271                     dofd->dofd_links[l])) == NULL)
13272                         goto err; /* invalid section link */
13273
13274                 if (ttl + subsec->dofs_size > max) {
13275                         dtrace_dof_error(dof, "exceeds maximum size");
13276                         goto err;
13277                 }
13278
13279                 ttl += subsec->dofs_size;
13280
13281                 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13282                         if (subsec->dofs_type != difo[i].section)
13283                                 continue;
13284
13285                         if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13286                                 dtrace_dof_error(dof, "section not loaded");
13287                                 goto err;
13288                         }
13289
13290                         if (subsec->dofs_align != difo[i].align) {
13291                                 dtrace_dof_error(dof, "bad alignment");
13292                                 goto err;
13293                         }
13294
13295                         bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13296                         lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13297
13298                         if (*bufp != NULL) {
13299                                 dtrace_dof_error(dof, difo[i].msg);
13300                                 goto err;
13301                         }
13302
13303                         if (difo[i].entsize != subsec->dofs_entsize) {
13304                                 dtrace_dof_error(dof, "entry size mismatch");
13305                                 goto err;
13306                         }
13307
13308                         if (subsec->dofs_entsize != 0 &&
13309                             (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13310                                 dtrace_dof_error(dof, "corrupt entry size");
13311                                 goto err;
13312                         }
13313
13314                         *lenp = subsec->dofs_size;
13315                         *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13316                         bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13317                             *bufp, subsec->dofs_size);
13318
13319                         if (subsec->dofs_entsize != 0)
13320                                 *lenp /= subsec->dofs_entsize;
13321
13322                         break;
13323                 }
13324
13325                 /*
13326                  * If we encounter a loadable DIFO sub-section that is not
13327                  * known to us, assume this is a broken program and fail.
13328                  */
13329                 if (difo[i].section == DOF_SECT_NONE &&
13330                     (subsec->dofs_flags & DOF_SECF_LOAD)) {
13331                         dtrace_dof_error(dof, "unrecognized DIFO subsection");
13332                         goto err;
13333                 }
13334         }
13335
13336         if (dp->dtdo_buf == NULL) {
13337                 /*
13338                  * We can't have a DIF object without DIF text.
13339                  */
13340                 dtrace_dof_error(dof, "missing DIF text");
13341                 goto err;
13342         }
13343
13344         /*
13345          * Before we validate the DIF object, run through the variable table
13346          * looking for the strings -- if any of their size are under, we'll set
13347          * their size to be the system-wide default string size.  Note that
13348          * this should _not_ happen if the "strsize" option has been set --
13349          * in this case, the compiler should have set the size to reflect the
13350          * setting of the option.
13351          */
13352         for (i = 0; i < dp->dtdo_varlen; i++) {
13353                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13354                 dtrace_diftype_t *t = &v->dtdv_type;
13355
13356                 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13357                         continue;
13358
13359                 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13360                         t->dtdt_size = dtrace_strsize_default;
13361         }
13362
13363         if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13364                 goto err;
13365
13366         dtrace_difo_init(dp, vstate);
13367         return (dp);
13368
13369 err:
13370         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13371         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13372         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13373         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13374
13375         kmem_free(dp, sizeof (dtrace_difo_t));
13376         return (NULL);
13377 }
13378
13379 static dtrace_predicate_t *
13380 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13381     cred_t *cr)
13382 {
13383         dtrace_difo_t *dp;
13384
13385         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13386                 return (NULL);
13387
13388         return (dtrace_predicate_create(dp));
13389 }
13390
13391 static dtrace_actdesc_t *
13392 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13393     cred_t *cr)
13394 {
13395         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13396         dof_actdesc_t *desc;
13397         dof_sec_t *difosec;
13398         size_t offs;
13399         uintptr_t daddr = (uintptr_t)dof;
13400         uint64_t arg;
13401         dtrace_actkind_t kind;
13402
13403         if (sec->dofs_type != DOF_SECT_ACTDESC) {
13404                 dtrace_dof_error(dof, "invalid action section");
13405                 return (NULL);
13406         }
13407
13408         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13409                 dtrace_dof_error(dof, "truncated action description");
13410                 return (NULL);
13411         }
13412
13413         if (sec->dofs_align != sizeof (uint64_t)) {
13414                 dtrace_dof_error(dof, "bad alignment in action description");
13415                 return (NULL);
13416         }
13417
13418         if (sec->dofs_size < sec->dofs_entsize) {
13419                 dtrace_dof_error(dof, "section entry size exceeds total size");
13420                 return (NULL);
13421         }
13422
13423         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13424                 dtrace_dof_error(dof, "bad entry size in action description");
13425                 return (NULL);
13426         }
13427
13428         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13429                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13430                 return (NULL);
13431         }
13432
13433         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13434                 desc = (dof_actdesc_t *)(daddr +
13435                     (uintptr_t)sec->dofs_offset + offs);
13436                 kind = (dtrace_actkind_t)desc->dofa_kind;
13437
13438                 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13439                     (kind != DTRACEACT_PRINTA ||
13440                     desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13441                     (kind == DTRACEACT_DIFEXPR &&
13442                     desc->dofa_strtab != DOF_SECIDX_NONE)) {
13443                         dof_sec_t *strtab;
13444                         char *str, *fmt;
13445                         uint64_t i;
13446
13447                         /*
13448                          * The argument to these actions is an index into the
13449                          * DOF string table.  For printf()-like actions, this
13450                          * is the format string.  For print(), this is the
13451                          * CTF type of the expression result.
13452                          */
13453                         if ((strtab = dtrace_dof_sect(dof,
13454                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13455                                 goto err;
13456
13457                         str = (char *)((uintptr_t)dof +
13458                             (uintptr_t)strtab->dofs_offset);
13459
13460                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13461                                 if (str[i] == '\0')
13462                                         break;
13463                         }
13464
13465                         if (i >= strtab->dofs_size) {
13466                                 dtrace_dof_error(dof, "bogus format string");
13467                                 goto err;
13468                         }
13469
13470                         if (i == desc->dofa_arg) {
13471                                 dtrace_dof_error(dof, "empty format string");
13472                                 goto err;
13473                         }
13474
13475                         i -= desc->dofa_arg;
13476                         fmt = kmem_alloc(i + 1, KM_SLEEP);
13477                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
13478                         arg = (uint64_t)(uintptr_t)fmt;
13479                 } else {
13480                         if (kind == DTRACEACT_PRINTA) {
13481                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13482                                 arg = 0;
13483                         } else {
13484                                 arg = desc->dofa_arg;
13485                         }
13486                 }
13487
13488                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13489                     desc->dofa_uarg, arg);
13490
13491                 if (last != NULL) {
13492                         last->dtad_next = act;
13493                 } else {
13494                         first = act;
13495                 }
13496
13497                 last = act;
13498
13499                 if (desc->dofa_difo == DOF_SECIDX_NONE)
13500                         continue;
13501
13502                 if ((difosec = dtrace_dof_sect(dof,
13503                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13504                         goto err;
13505
13506                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13507
13508                 if (act->dtad_difo == NULL)
13509                         goto err;
13510         }
13511
13512         ASSERT(first != NULL);
13513         return (first);
13514
13515 err:
13516         for (act = first; act != NULL; act = next) {
13517                 next = act->dtad_next;
13518                 dtrace_actdesc_release(act, vstate);
13519         }
13520
13521         return (NULL);
13522 }
13523
13524 static dtrace_ecbdesc_t *
13525 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13526     cred_t *cr)
13527 {
13528         dtrace_ecbdesc_t *ep;
13529         dof_ecbdesc_t *ecb;
13530         dtrace_probedesc_t *desc;
13531         dtrace_predicate_t *pred = NULL;
13532
13533         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13534                 dtrace_dof_error(dof, "truncated ECB description");
13535                 return (NULL);
13536         }
13537
13538         if (sec->dofs_align != sizeof (uint64_t)) {
13539                 dtrace_dof_error(dof, "bad alignment in ECB description");
13540                 return (NULL);
13541         }
13542
13543         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13544         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13545
13546         if (sec == NULL)
13547                 return (NULL);
13548
13549         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13550         ep->dted_uarg = ecb->dofe_uarg;
13551         desc = &ep->dted_probe;
13552
13553         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13554                 goto err;
13555
13556         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13557                 if ((sec = dtrace_dof_sect(dof,
13558                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13559                         goto err;
13560
13561                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13562                         goto err;
13563
13564                 ep->dted_pred.dtpdd_predicate = pred;
13565         }
13566
13567         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13568                 if ((sec = dtrace_dof_sect(dof,
13569                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13570                         goto err;
13571
13572                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13573
13574                 if (ep->dted_action == NULL)
13575                         goto err;
13576         }
13577
13578         return (ep);
13579
13580 err:
13581         if (pred != NULL)
13582                 dtrace_predicate_release(pred, vstate);
13583         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13584         return (NULL);
13585 }
13586
13587 /*
13588  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13589  * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13590  * site of any user SETX relocations to account for load object base address.
13591  * In the future, if we need other relocations, this function can be extended.
13592  */
13593 static int
13594 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13595 {
13596         uintptr_t daddr = (uintptr_t)dof;
13597         dof_relohdr_t *dofr =
13598             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13599         dof_sec_t *ss, *rs, *ts;
13600         dof_relodesc_t *r;
13601         uint_t i, n;
13602
13603         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13604             sec->dofs_align != sizeof (dof_secidx_t)) {
13605                 dtrace_dof_error(dof, "invalid relocation header");
13606                 return (-1);
13607         }
13608
13609         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13610         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13611         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13612
13613         if (ss == NULL || rs == NULL || ts == NULL)
13614                 return (-1); /* dtrace_dof_error() has been called already */
13615
13616         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13617             rs->dofs_align != sizeof (uint64_t)) {
13618                 dtrace_dof_error(dof, "invalid relocation section");
13619                 return (-1);
13620         }
13621
13622         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13623         n = rs->dofs_size / rs->dofs_entsize;
13624
13625         for (i = 0; i < n; i++) {
13626                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13627
13628                 switch (r->dofr_type) {
13629                 case DOF_RELO_NONE:
13630                         break;
13631                 case DOF_RELO_SETX:
13632                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13633                             sizeof (uint64_t) > ts->dofs_size) {
13634                                 dtrace_dof_error(dof, "bad relocation offset");
13635                                 return (-1);
13636                         }
13637
13638                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13639                                 dtrace_dof_error(dof, "misaligned setx relo");
13640                                 return (-1);
13641                         }
13642
13643                         *(uint64_t *)taddr += ubase;
13644                         break;
13645                 default:
13646                         dtrace_dof_error(dof, "invalid relocation type");
13647                         return (-1);
13648                 }
13649
13650                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13651         }
13652
13653         return (0);
13654 }
13655
13656 /*
13657  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13658  * header:  it should be at the front of a memory region that is at least
13659  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13660  * size.  It need not be validated in any other way.
13661  */
13662 static int
13663 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13664     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13665 {
13666         uint64_t len = dof->dofh_loadsz, seclen;
13667         uintptr_t daddr = (uintptr_t)dof;
13668         dtrace_ecbdesc_t *ep;
13669         dtrace_enabling_t *enab;
13670         uint_t i;
13671
13672         ASSERT(MUTEX_HELD(&dtrace_lock));
13673         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13674
13675         /*
13676          * Check the DOF header identification bytes.  In addition to checking
13677          * valid settings, we also verify that unused bits/bytes are zeroed so
13678          * we can use them later without fear of regressing existing binaries.
13679          */
13680         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13681             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13682                 dtrace_dof_error(dof, "DOF magic string mismatch");
13683                 return (-1);
13684         }
13685
13686         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13687             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13688                 dtrace_dof_error(dof, "DOF has invalid data model");
13689                 return (-1);
13690         }
13691
13692         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13693                 dtrace_dof_error(dof, "DOF encoding mismatch");
13694                 return (-1);
13695         }
13696
13697         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13698             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13699                 dtrace_dof_error(dof, "DOF version mismatch");
13700                 return (-1);
13701         }
13702
13703         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13704                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13705                 return (-1);
13706         }
13707
13708         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13709                 dtrace_dof_error(dof, "DOF uses too many integer registers");
13710                 return (-1);
13711         }
13712
13713         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13714                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13715                 return (-1);
13716         }
13717
13718         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13719                 if (dof->dofh_ident[i] != 0) {
13720                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
13721                         return (-1);
13722                 }
13723         }
13724
13725         if (dof->dofh_flags & ~DOF_FL_VALID) {
13726                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13727                 return (-1);
13728         }
13729
13730         if (dof->dofh_secsize == 0) {
13731                 dtrace_dof_error(dof, "zero section header size");
13732                 return (-1);
13733         }
13734
13735         /*
13736          * Check that the section headers don't exceed the amount of DOF
13737          * data.  Note that we cast the section size and number of sections
13738          * to uint64_t's to prevent possible overflow in the multiplication.
13739          */
13740         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13741
13742         if (dof->dofh_secoff > len || seclen > len ||
13743             dof->dofh_secoff + seclen > len) {
13744                 dtrace_dof_error(dof, "truncated section headers");
13745                 return (-1);
13746         }
13747
13748         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13749                 dtrace_dof_error(dof, "misaligned section headers");
13750                 return (-1);
13751         }
13752
13753         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13754                 dtrace_dof_error(dof, "misaligned section size");
13755                 return (-1);
13756         }
13757
13758         /*
13759          * Take an initial pass through the section headers to be sure that
13760          * the headers don't have stray offsets.  If the 'noprobes' flag is
13761          * set, do not permit sections relating to providers, probes, or args.
13762          */
13763         for (i = 0; i < dof->dofh_secnum; i++) {
13764                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13765                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13766
13767                 if (noprobes) {
13768                         switch (sec->dofs_type) {
13769                         case DOF_SECT_PROVIDER:
13770                         case DOF_SECT_PROBES:
13771                         case DOF_SECT_PRARGS:
13772                         case DOF_SECT_PROFFS:
13773                                 dtrace_dof_error(dof, "illegal sections "
13774                                     "for enabling");
13775                                 return (-1);
13776                         }
13777                 }
13778
13779                 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13780                     !(sec->dofs_flags & DOF_SECF_LOAD)) {
13781                         dtrace_dof_error(dof, "loadable section with load "
13782                             "flag unset");
13783                         return (-1);
13784                 }
13785
13786                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13787                         continue; /* just ignore non-loadable sections */
13788
13789                 if (!ISP2(sec->dofs_align)) {
13790                         dtrace_dof_error(dof, "bad section alignment");
13791                         return (-1);
13792                 }
13793
13794                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13795                         dtrace_dof_error(dof, "misaligned section");
13796                         return (-1);
13797                 }
13798
13799                 if (sec->dofs_offset > len || sec->dofs_size > len ||
13800                     sec->dofs_offset + sec->dofs_size > len) {
13801                         dtrace_dof_error(dof, "corrupt section header");
13802                         return (-1);
13803                 }
13804
13805                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13806                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13807                         dtrace_dof_error(dof, "non-terminating string table");
13808                         return (-1);
13809                 }
13810         }
13811
13812         /*
13813          * Take a second pass through the sections and locate and perform any
13814          * relocations that are present.  We do this after the first pass to
13815          * be sure that all sections have had their headers validated.
13816          */
13817         for (i = 0; i < dof->dofh_secnum; i++) {
13818                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13819                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13820
13821                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13822                         continue; /* skip sections that are not loadable */
13823
13824                 switch (sec->dofs_type) {
13825                 case DOF_SECT_URELHDR:
13826                         if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13827                                 return (-1);
13828                         break;
13829                 }
13830         }
13831
13832         if ((enab = *enabp) == NULL)
13833                 enab = *enabp = dtrace_enabling_create(vstate);
13834
13835         for (i = 0; i < dof->dofh_secnum; i++) {
13836                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13837                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13838
13839                 if (sec->dofs_type != DOF_SECT_ECBDESC)
13840                         continue;
13841
13842                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13843                         dtrace_enabling_destroy(enab);
13844                         *enabp = NULL;
13845                         return (-1);
13846                 }
13847
13848                 dtrace_enabling_add(enab, ep);
13849         }
13850
13851         return (0);
13852 }
13853
13854 /*
13855  * Process DOF for any options.  This routine assumes that the DOF has been
13856  * at least processed by dtrace_dof_slurp().
13857  */
13858 static int
13859 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13860 {
13861         int i, rval;
13862         uint32_t entsize;
13863         size_t offs;
13864         dof_optdesc_t *desc;
13865
13866         for (i = 0; i < dof->dofh_secnum; i++) {
13867                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13868                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13869
13870                 if (sec->dofs_type != DOF_SECT_OPTDESC)
13871                         continue;
13872
13873                 if (sec->dofs_align != sizeof (uint64_t)) {
13874                         dtrace_dof_error(dof, "bad alignment in "
13875                             "option description");
13876                         return (EINVAL);
13877                 }
13878
13879                 if ((entsize = sec->dofs_entsize) == 0) {
13880                         dtrace_dof_error(dof, "zeroed option entry size");
13881                         return (EINVAL);
13882                 }
13883
13884                 if (entsize < sizeof (dof_optdesc_t)) {
13885                         dtrace_dof_error(dof, "bad option entry size");
13886                         return (EINVAL);
13887                 }
13888
13889                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13890                         desc = (dof_optdesc_t *)((uintptr_t)dof +
13891                             (uintptr_t)sec->dofs_offset + offs);
13892
13893                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13894                                 dtrace_dof_error(dof, "non-zero option string");
13895                                 return (EINVAL);
13896                         }
13897
13898                         if (desc->dofo_value == DTRACEOPT_UNSET) {
13899                                 dtrace_dof_error(dof, "unset option");
13900                                 return (EINVAL);
13901                         }
13902
13903                         if ((rval = dtrace_state_option(state,
13904                             desc->dofo_option, desc->dofo_value)) != 0) {
13905                                 dtrace_dof_error(dof, "rejected option");
13906                                 return (rval);
13907                         }
13908                 }
13909         }
13910
13911         return (0);
13912 }
13913
13914 /*
13915  * DTrace Consumer State Functions
13916  */
13917 static int
13918 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13919 {
13920         size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13921         void *base;
13922         uintptr_t limit;
13923         dtrace_dynvar_t *dvar, *next, *start;
13924         int i;
13925
13926         ASSERT(MUTEX_HELD(&dtrace_lock));
13927         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13928
13929         bzero(dstate, sizeof (dtrace_dstate_t));
13930
13931         if ((dstate->dtds_chunksize = chunksize) == 0)
13932                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13933
13934         if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13935                 size = min;
13936
13937         if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13938                 return (ENOMEM);
13939
13940         dstate->dtds_size = size;
13941         dstate->dtds_base = base;
13942         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13943         bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13944
13945         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13946
13947         if (hashsize != 1 && (hashsize & 1))
13948                 hashsize--;
13949
13950         dstate->dtds_hashsize = hashsize;
13951         dstate->dtds_hash = dstate->dtds_base;
13952
13953         /*
13954          * Set all of our hash buckets to point to the single sink, and (if
13955          * it hasn't already been set), set the sink's hash value to be the
13956          * sink sentinel value.  The sink is needed for dynamic variable
13957          * lookups to know that they have iterated over an entire, valid hash
13958          * chain.
13959          */
13960         for (i = 0; i < hashsize; i++)
13961                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13962
13963         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13964                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13965
13966         /*
13967          * Determine number of active CPUs.  Divide free list evenly among
13968          * active CPUs.
13969          */
13970         start = (dtrace_dynvar_t *)
13971             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13972         limit = (uintptr_t)base + size;
13973
13974         maxper = (limit - (uintptr_t)start) / NCPU;
13975         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13976
13977 #if !defined(sun)
13978         CPU_FOREACH(i) {
13979 #else
13980         for (i = 0; i < NCPU; i++) {
13981 #endif
13982                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13983
13984                 /*
13985                  * If we don't even have enough chunks to make it once through
13986                  * NCPUs, we're just going to allocate everything to the first
13987                  * CPU.  And if we're on the last CPU, we're going to allocate
13988                  * whatever is left over.  In either case, we set the limit to
13989                  * be the limit of the dynamic variable space.
13990                  */
13991                 if (maxper == 0 || i == NCPU - 1) {
13992                         limit = (uintptr_t)base + size;
13993                         start = NULL;
13994                 } else {
13995                         limit = (uintptr_t)start + maxper;
13996                         start = (dtrace_dynvar_t *)limit;
13997                 }
13998
13999                 ASSERT(limit <= (uintptr_t)base + size);
14000
14001                 for (;;) {
14002                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14003                             dstate->dtds_chunksize);
14004
14005                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14006                                 break;
14007
14008                         dvar->dtdv_next = next;
14009                         dvar = next;
14010                 }
14011
14012                 if (maxper == 0)
14013                         break;
14014         }
14015
14016         return (0);
14017 }
14018
14019 static void
14020 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14021 {
14022         ASSERT(MUTEX_HELD(&cpu_lock));
14023
14024         if (dstate->dtds_base == NULL)
14025                 return;
14026
14027         kmem_free(dstate->dtds_base, dstate->dtds_size);
14028         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14029 }
14030
14031 static void
14032 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14033 {
14034         /*
14035          * Logical XOR, where are you?
14036          */
14037         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14038
14039         if (vstate->dtvs_nglobals > 0) {
14040                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14041                     sizeof (dtrace_statvar_t *));
14042         }
14043
14044         if (vstate->dtvs_ntlocals > 0) {
14045                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14046                     sizeof (dtrace_difv_t));
14047         }
14048
14049         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14050
14051         if (vstate->dtvs_nlocals > 0) {
14052                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14053                     sizeof (dtrace_statvar_t *));
14054         }
14055 }
14056
14057 #if defined(sun)
14058 static void
14059 dtrace_state_clean(dtrace_state_t *state)
14060 {
14061         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14062                 return;
14063
14064         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14065         dtrace_speculation_clean(state);
14066 }
14067
14068 static void
14069 dtrace_state_deadman(dtrace_state_t *state)
14070 {
14071         hrtime_t now;
14072
14073         dtrace_sync();
14074
14075         now = dtrace_gethrtime();
14076
14077         if (state != dtrace_anon.dta_state &&
14078             now - state->dts_laststatus >= dtrace_deadman_user)
14079                 return;
14080
14081         /*
14082          * We must be sure that dts_alive never appears to be less than the
14083          * value upon entry to dtrace_state_deadman(), and because we lack a
14084          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14085          * store INT64_MAX to it, followed by a memory barrier, followed by
14086          * the new value.  This assures that dts_alive never appears to be
14087          * less than its true value, regardless of the order in which the
14088          * stores to the underlying storage are issued.
14089          */
14090         state->dts_alive = INT64_MAX;
14091         dtrace_membar_producer();
14092         state->dts_alive = now;
14093 }
14094 #else
14095 static void
14096 dtrace_state_clean(void *arg)
14097 {
14098         dtrace_state_t *state = arg;
14099         dtrace_optval_t *opt = state->dts_options;
14100
14101         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14102                 return;
14103
14104         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14105         dtrace_speculation_clean(state);
14106
14107         callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14108             dtrace_state_clean, state);
14109 }
14110
14111 static void
14112 dtrace_state_deadman(void *arg)
14113 {
14114         dtrace_state_t *state = arg;
14115         hrtime_t now;
14116
14117         dtrace_sync();
14118
14119         dtrace_debug_output();
14120
14121         now = dtrace_gethrtime();
14122
14123         if (state != dtrace_anon.dta_state &&
14124             now - state->dts_laststatus >= dtrace_deadman_user)
14125                 return;
14126
14127         /*
14128          * We must be sure that dts_alive never appears to be less than the
14129          * value upon entry to dtrace_state_deadman(), and because we lack a
14130          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14131          * store INT64_MAX to it, followed by a memory barrier, followed by
14132          * the new value.  This assures that dts_alive never appears to be
14133          * less than its true value, regardless of the order in which the
14134          * stores to the underlying storage are issued.
14135          */
14136         state->dts_alive = INT64_MAX;
14137         dtrace_membar_producer();
14138         state->dts_alive = now;
14139
14140         callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14141             dtrace_state_deadman, state);
14142 }
14143 #endif
14144
14145 static dtrace_state_t *
14146 #if defined(sun)
14147 dtrace_state_create(dev_t *devp, cred_t *cr)
14148 #else
14149 dtrace_state_create(struct cdev *dev)
14150 #endif
14151 {
14152 #if defined(sun)
14153         minor_t minor;
14154         major_t major;
14155 #else
14156         cred_t *cr = NULL;
14157         int m = 0;
14158 #endif
14159         char c[30];
14160         dtrace_state_t *state;
14161         dtrace_optval_t *opt;
14162         int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14163
14164         ASSERT(MUTEX_HELD(&dtrace_lock));
14165         ASSERT(MUTEX_HELD(&cpu_lock));
14166
14167 #if defined(sun)
14168         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14169             VM_BESTFIT | VM_SLEEP);
14170
14171         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14172                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14173                 return (NULL);
14174         }
14175
14176         state = ddi_get_soft_state(dtrace_softstate, minor);
14177 #else
14178         if (dev != NULL) {
14179                 cr = dev->si_cred;
14180                 m = dev2unit(dev);
14181                 }
14182
14183         /* Allocate memory for the state. */
14184         state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14185 #endif
14186
14187         state->dts_epid = DTRACE_EPIDNONE + 1;
14188
14189         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14190 #if defined(sun)
14191         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14192             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14193
14194         if (devp != NULL) {
14195                 major = getemajor(*devp);
14196         } else {
14197                 major = ddi_driver_major(dtrace_devi);
14198         }
14199
14200         state->dts_dev = makedevice(major, minor);
14201
14202         if (devp != NULL)
14203                 *devp = state->dts_dev;
14204 #else
14205         state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14206         state->dts_dev = dev;
14207 #endif
14208
14209         /*
14210          * We allocate NCPU buffers.  On the one hand, this can be quite
14211          * a bit of memory per instance (nearly 36K on a Starcat).  On the
14212          * other hand, it saves an additional memory reference in the probe
14213          * path.
14214          */
14215         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14216         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14217
14218 #if defined(sun)
14219         state->dts_cleaner = CYCLIC_NONE;
14220         state->dts_deadman = CYCLIC_NONE;
14221 #else
14222         callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14223         callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14224 #endif
14225         state->dts_vstate.dtvs_state = state;
14226
14227         for (i = 0; i < DTRACEOPT_MAX; i++)
14228                 state->dts_options[i] = DTRACEOPT_UNSET;
14229
14230         /*
14231          * Set the default options.
14232          */
14233         opt = state->dts_options;
14234         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14235         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14236         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14237         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14238         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14239         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14240         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14241         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14242         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14243         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14244         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14245         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14246         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14247         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14248
14249         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14250
14251         /*
14252          * Depending on the user credentials, we set flag bits which alter probe
14253          * visibility or the amount of destructiveness allowed.  In the case of
14254          * actual anonymous tracing, or the possession of all privileges, all of
14255          * the normal checks are bypassed.
14256          */
14257         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14258                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14259                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14260         } else {
14261                 /*
14262                  * Set up the credentials for this instantiation.  We take a
14263                  * hold on the credential to prevent it from disappearing on
14264                  * us; this in turn prevents the zone_t referenced by this
14265                  * credential from disappearing.  This means that we can
14266                  * examine the credential and the zone from probe context.
14267                  */
14268                 crhold(cr);
14269                 state->dts_cred.dcr_cred = cr;
14270
14271                 /*
14272                  * CRA_PROC means "we have *some* privilege for dtrace" and
14273                  * unlocks the use of variables like pid, zonename, etc.
14274                  */
14275                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14276                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14277                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14278                 }
14279
14280                 /*
14281                  * dtrace_user allows use of syscall and profile providers.
14282                  * If the user also has proc_owner and/or proc_zone, we
14283                  * extend the scope to include additional visibility and
14284                  * destructive power.
14285                  */
14286                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14287                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14288                                 state->dts_cred.dcr_visible |=
14289                                     DTRACE_CRV_ALLPROC;
14290
14291                                 state->dts_cred.dcr_action |=
14292                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14293                         }
14294
14295                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14296                                 state->dts_cred.dcr_visible |=
14297                                     DTRACE_CRV_ALLZONE;
14298
14299                                 state->dts_cred.dcr_action |=
14300                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14301                         }
14302
14303                         /*
14304                          * If we have all privs in whatever zone this is,
14305                          * we can do destructive things to processes which
14306                          * have altered credentials.
14307                          */
14308 #if defined(sun)
14309                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14310                             cr->cr_zone->zone_privset)) {
14311                                 state->dts_cred.dcr_action |=
14312                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14313                         }
14314 #endif
14315                 }
14316
14317                 /*
14318                  * Holding the dtrace_kernel privilege also implies that
14319                  * the user has the dtrace_user privilege from a visibility
14320                  * perspective.  But without further privileges, some
14321                  * destructive actions are not available.
14322                  */
14323                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14324                         /*
14325                          * Make all probes in all zones visible.  However,
14326                          * this doesn't mean that all actions become available
14327                          * to all zones.
14328                          */
14329                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14330                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14331
14332                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14333                             DTRACE_CRA_PROC;
14334                         /*
14335                          * Holding proc_owner means that destructive actions
14336                          * for *this* zone are allowed.
14337                          */
14338                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14339                                 state->dts_cred.dcr_action |=
14340                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14341
14342                         /*
14343                          * Holding proc_zone means that destructive actions
14344                          * for this user/group ID in all zones is allowed.
14345                          */
14346                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14347                                 state->dts_cred.dcr_action |=
14348                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14349
14350 #if defined(sun)
14351                         /*
14352                          * If we have all privs in whatever zone this is,
14353                          * we can do destructive things to processes which
14354                          * have altered credentials.
14355                          */
14356                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14357                             cr->cr_zone->zone_privset)) {
14358                                 state->dts_cred.dcr_action |=
14359                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14360                         }
14361 #endif
14362                 }
14363
14364                 /*
14365                  * Holding the dtrace_proc privilege gives control over fasttrap
14366                  * and pid providers.  We need to grant wider destructive
14367                  * privileges in the event that the user has proc_owner and/or
14368                  * proc_zone.
14369                  */
14370                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14371                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14372                                 state->dts_cred.dcr_action |=
14373                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14374
14375                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14376                                 state->dts_cred.dcr_action |=
14377                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14378                 }
14379         }
14380
14381         return (state);
14382 }
14383
14384 static int
14385 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14386 {
14387         dtrace_optval_t *opt = state->dts_options, size;
14388         processorid_t cpu = 0;;
14389         int flags = 0, rval, factor, divisor = 1;
14390
14391         ASSERT(MUTEX_HELD(&dtrace_lock));
14392         ASSERT(MUTEX_HELD(&cpu_lock));
14393         ASSERT(which < DTRACEOPT_MAX);
14394         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14395             (state == dtrace_anon.dta_state &&
14396             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14397
14398         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14399                 return (0);
14400
14401         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14402                 cpu = opt[DTRACEOPT_CPU];
14403
14404         if (which == DTRACEOPT_SPECSIZE)
14405                 flags |= DTRACEBUF_NOSWITCH;
14406
14407         if (which == DTRACEOPT_BUFSIZE) {
14408                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14409                         flags |= DTRACEBUF_RING;
14410
14411                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14412                         flags |= DTRACEBUF_FILL;
14413
14414                 if (state != dtrace_anon.dta_state ||
14415                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14416                         flags |= DTRACEBUF_INACTIVE;
14417         }
14418
14419         for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14420                 /*
14421                  * The size must be 8-byte aligned.  If the size is not 8-byte
14422                  * aligned, drop it down by the difference.
14423                  */
14424                 if (size & (sizeof (uint64_t) - 1))
14425                         size -= size & (sizeof (uint64_t) - 1);
14426
14427                 if (size < state->dts_reserve) {
14428                         /*
14429                          * Buffers always must be large enough to accommodate
14430                          * their prereserved space.  We return E2BIG instead
14431                          * of ENOMEM in this case to allow for user-level
14432                          * software to differentiate the cases.
14433                          */
14434                         return (E2BIG);
14435                 }
14436
14437                 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14438
14439                 if (rval != ENOMEM) {
14440                         opt[which] = size;
14441                         return (rval);
14442                 }
14443
14444                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14445                         return (rval);
14446
14447                 for (divisor = 2; divisor < factor; divisor <<= 1)
14448                         continue;
14449         }
14450
14451         return (ENOMEM);
14452 }
14453
14454 static int
14455 dtrace_state_buffers(dtrace_state_t *state)
14456 {
14457         dtrace_speculation_t *spec = state->dts_speculations;
14458         int rval, i;
14459
14460         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14461             DTRACEOPT_BUFSIZE)) != 0)
14462                 return (rval);
14463
14464         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14465             DTRACEOPT_AGGSIZE)) != 0)
14466                 return (rval);
14467
14468         for (i = 0; i < state->dts_nspeculations; i++) {
14469                 if ((rval = dtrace_state_buffer(state,
14470                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14471                         return (rval);
14472         }
14473
14474         return (0);
14475 }
14476
14477 static void
14478 dtrace_state_prereserve(dtrace_state_t *state)
14479 {
14480         dtrace_ecb_t *ecb;
14481         dtrace_probe_t *probe;
14482
14483         state->dts_reserve = 0;
14484
14485         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14486                 return;
14487
14488         /*
14489          * If our buffer policy is a "fill" buffer policy, we need to set the
14490          * prereserved space to be the space required by the END probes.
14491          */
14492         probe = dtrace_probes[dtrace_probeid_end - 1];
14493         ASSERT(probe != NULL);
14494
14495         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14496                 if (ecb->dte_state != state)
14497                         continue;
14498
14499                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14500         }
14501 }
14502
/*
 * Start tracing for 'state'.  In order, this function: primes the state's
 * retained enablings, checks the destructive-action permission, computes the
 * buffer reserve, allocates the speculation array and its per-speculation
 * buffer arrays, optionally grabs the anonymous state, sanity-checks and
 * allocates the buffers, initializes dynamic variable state, clamps the
 * status/clean rates, arms the cleaner and deadman timers, fires the BEGIN
 * probe and finally activates the principal buffers on all CPUs.
 *
 * cpu_lock and dtrace_lock are taken on entry and dropped on every exit path.
 *
 * 'cpu' receives the CPU on which the BEGIN probe was fired; if an anonymous
 * state that is already running is grabbed, it receives
 * dtrace_anon.dta_beganon instead.
 *
 * Returns 0 on success, or one of: EBUSY (state is not inactive), EACCES
 * (destructive actions present but not permitted), ENOMEM (too many
 * speculations or allocation failure), ENOENT / EALREADY (grabanon requested
 * but no anonymous state exists / this state already has ECBs), ENOSPC (a
 * needed buffer has a size below sizeof (uint64_t)), or whatever
 * dtrace_state_buffers() / dtrace_dstate_init() returned.
 */
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
        dtrace_optval_t *opt = state->dts_options, sz, nspec;
        dtrace_speculation_t *spec;
        dtrace_buffer_t *buf;
#if defined(sun)
        cyc_handler_t hdlr;
        cyc_time_t when;
#endif
        /* Size in bytes of one per-CPU array of dtrace_buffer_t. */
        int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
        dtrace_icookie_t cookie;

        mutex_enter(&cpu_lock);
        mutex_enter(&dtrace_lock);

        if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
                rval = EBUSY;
                goto out;
        }

        /*
         * Before we can perform any checks, we must prime all of the
         * retained enablings that correspond to this state.
         */
        dtrace_enabling_prime(state);

        if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
                rval = EACCES;
                goto out;
        }

        dtrace_state_prereserve(state);

        /*
         * What we want to do now is try to allocate our speculations.
         * We do not automatically resize the number of speculations; if
         * this fails, we will fail the operation.
         */
        nspec = opt[DTRACEOPT_NSPEC];
        ASSERT(nspec != DTRACEOPT_UNSET);

        /* dts_nspeculations is stored as an int (see the cast below). */
        if (nspec > INT_MAX) {
                rval = ENOMEM;
                goto out;
        }

        spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
            KM_NOSLEEP | KM_NORMALPRI);

        if (spec == NULL) {
                rval = ENOMEM;
                goto out;
        }

        state->dts_speculations = spec;
        state->dts_nspeculations = (int)nspec;

        /*
         * Give every speculation its own zeroed per-CPU buffer array.  On
         * failure the 'err' path frees the arrays allocated so far; it stops
         * at the first NULL dtsp_buffer.
         */
        for (i = 0; i < nspec; i++) {
                if ((buf = kmem_zalloc(bufsize,
                    KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
                        rval = ENOMEM;
                        goto err;
                }

                spec[i].dtsp_buffer = buf;
        }

        if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
                /*
                 * NOTE(review): the failure exits in this block (and the
                 * ENOSPC exit further down) leave through 'out', not 'err',
                 * so the speculation array allocated above stays attached to
                 * the state.  It looks like a second call on the same state
                 * would then overwrite state->dts_speculations -- confirm
                 * whether callers can retry and whether this is intended.
                 */
                if (dtrace_anon.dta_state == NULL) {
                        rval = ENOENT;
                        goto out;
                }

                if (state->dts_necbs != 0) {
                        rval = EALREADY;
                        goto out;
                }

                /*
                 * From here on 'state' refers to the grabbed anonymous
                 * state.  'opt' is not reassigned, so it still refers to the
                 * option array of the grabbing state.
                 */
                state->dts_anon = dtrace_anon_grab();
                ASSERT(state->dts_anon != NULL);
                state = state->dts_anon;

                /*
                 * We want "grabanon" to be set in the grabbed state, so we'll
                 * copy that option value from the grabbing state into the
                 * grabbed state.
                 */
                state->dts_options[DTRACEOPT_GRABANON] =
                    opt[DTRACEOPT_GRABANON];

                *cpu = dtrace_anon.dta_beganon;

                /*
                 * If the anonymous state is active (as it almost certainly
                 * is if the anonymous enabling ultimately matched anything),
                 * we don't allow any further option processing -- but we
                 * don't return failure.
                 */
                if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
                        goto out;
        }

        if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
            opt[DTRACEOPT_AGGSIZE] != 0) {
                if (state->dts_aggregations == NULL) {
                        /*
                         * We're not going to create an aggregation buffer
                         * because we don't have any ECBs that contain
                         * aggregations -- set this option to 0.
                         */
                        opt[DTRACEOPT_AGGSIZE] = 0;
                } else {
                        /*
                         * If we have an aggregation buffer, we must also have
                         * a buffer to use as scratch.
                         */
                        if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
                            opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
                                opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
                        }
                }
        }

        if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
            opt[DTRACEOPT_SPECSIZE] != 0) {
                if (!state->dts_speculates) {
                        /*
                         * We're not going to create speculation buffers
                         * because we don't have any ECBs that actually
                         * speculate -- set the speculation size to 0.
                         */
                        opt[DTRACEOPT_SPECSIZE] = 0;
                }
        }

        /*
         * The bare minimum size for any buffer that we're actually going to
         * do anything to is sizeof (uint64_t).
         */
        sz = sizeof (uint64_t);

        if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
            (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
            (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
                /*
                 * A buffer size has been explicitly set to 0 (or to a size
                 * that will be adjusted to 0) and we need the space -- we
                 * need to return failure.  We return ENOSPC to differentiate
                 * it from failing to allocate a buffer due to failure to meet
                 * the reserve (for which we return E2BIG).
                 */
                rval = ENOSPC;
                goto out;
        }

        if ((rval = dtrace_state_buffers(state)) != 0)
                goto err;

        if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
                sz = dtrace_dstate_defsize;

        /*
         * Initialize the dynamic variable state, halving the requested size
         * after each failure until it succeeds or the size reaches zero.
         * With a "manual" resize policy the first failure is fatal.
         */
        do {
                rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);

                if (rval == 0)
                        break;

                if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
                        goto err;
        } while (sz >>= 1);

        /* Record the size that was last attempted (0 if we ran out). */
        opt[DTRACEOPT_DYNVARSIZE] = sz;

        if (rval != 0)
                goto err;

        /* Clamp the status and clean rates to their permitted ranges. */
        if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
                opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;

        if (opt[DTRACEOPT_CLEANRATE] == 0)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

        if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;

        if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
                opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

        state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
        /*
         * Arm the periodic cleaner (dtrace_state_clean) and deadman
         * (dtrace_state_deadman) handlers: cyclics on Solaris, callouts
         * otherwise.
         */
#if defined(sun)
        hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
        hdlr.cyh_arg = state;
        hdlr.cyh_level = CY_LOW_LEVEL;

        when.cyt_when = 0;
        when.cyt_interval = opt[DTRACEOPT_CLEANRATE];

        state->dts_cleaner = cyclic_add(&hdlr, &when);

        hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
        hdlr.cyh_arg = state;
        hdlr.cyh_level = CY_LOW_LEVEL;

        when.cyt_when = 0;
        when.cyt_interval = dtrace_deadman_interval;

        state->dts_deadman = cyclic_add(&hdlr, &when);
#else
        callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
            dtrace_state_clean, state);
        callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
            dtrace_state_deadman, state);
#endif

        state->dts_activity = DTRACE_ACTIVITY_WARMUP;

#if defined(sun)
        if (state->dts_getf != 0 &&
            !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
                /*
                 * We don't have kernel privs but we have at least one call
                 * to getf(); we need to bump our zone's count, and (if
                 * this is the first enabling to have an unprivileged call
                 * to getf()) we need to hook into closef().
                 */
                state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;

                if (dtrace_getf++ == 0) {
                        ASSERT(dtrace_closef == NULL);
                        dtrace_closef = dtrace_getf_barrier;
                }
        }
#endif

        /*
         * Now it's time to actually fire the BEGIN probe.  We need to disable
         * interrupts here both to record the CPU on which we fired the BEGIN
         * probe (the data from this CPU will be processed first at user
         * level) and to manually activate the buffer for this CPU.
         */
        cookie = dtrace_interrupt_disable();
        *cpu = curcpu;
        ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
        state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;

        dtrace_probe(dtrace_probeid_begin,
            (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
        dtrace_interrupt_enable(cookie);
        /*
         * We may have had an exit action from a BEGIN probe; only change our
         * state to ACTIVE if we're still in WARMUP.
         */
        ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
            state->dts_activity == DTRACE_ACTIVITY_DRAINING);

        if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
                state->dts_activity = DTRACE_ACTIVITY_ACTIVE;

        /*
         * Regardless of whether or not now we're in ACTIVE or DRAINING, we
         * want each CPU to transition its principal buffer out of the
         * INACTIVE state.  Doing this assures that no CPU will suddenly begin
         * processing an ECB halfway down a probe's ECB chain; all CPUs will
         * atomically transition from processing none of a state's ECBs to
         * processing all of them.
         */
        dtrace_xcall(DTRACE_CPUALL,
            (dtrace_xcall_t)dtrace_buffer_activate, state);
        goto out;

        /*
         * Failure cleanup: release the principal and aggregation buffers and
         * tear down the speculation array of whatever 'state' currently
         * refers to, then fall through to drop the locks.
         */
err:
        dtrace_buffer_free(state->dts_buffer);
        dtrace_buffer_free(state->dts_aggbuffer);

        if ((nspec = state->dts_nspeculations) == 0) {
                ASSERT(state->dts_speculations == NULL);
                goto out;
        }

        spec = state->dts_speculations;
        ASSERT(spec != NULL);

        for (i = 0; i < state->dts_nspeculations; i++) {
                /* A NULL entry marks where the allocation loop stopped. */
                if ((buf = spec[i].dtsp_buffer) == NULL)
                        break;

                dtrace_buffer_free(buf);
                kmem_free(buf, bufsize);
        }

        kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
        state->dts_nspeculations = 0;
        state->dts_speculations = NULL;

out:
        mutex_exit(&dtrace_lock);
        mutex_exit(&cpu_lock);

        return (rval);
}
14804
14805 static int
14806 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14807 {
14808         dtrace_icookie_t cookie;
14809
14810         ASSERT(MUTEX_HELD(&dtrace_lock));
14811
14812         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14813             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14814                 return (EINVAL);
14815
14816         /*
14817          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14818          * to be sure that every CPU has seen it.  See below for the details
14819          * on why this is done.
14820          */
14821         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14822         dtrace_sync();
14823
14824         /*
14825          * By this point, it is impossible for any CPU to be still processing
14826          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14827          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14828          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14829          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14830          * iff we're in the END probe.
14831          */
14832         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14833         dtrace_sync();
14834         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14835
14836         /*
14837          * Finally, we can release the reserve and call the END probe.  We
14838          * disable interrupts across calling the END probe to allow us to
14839          * return the CPU on which we actually called the END probe.  This
14840          * allows user-land to be sure that this CPU's principal buffer is
14841          * processed last.
14842          */
14843         state->dts_reserve = 0;
14844
14845         cookie = dtrace_interrupt_disable();
14846         *cpu = curcpu;
14847         dtrace_probe(dtrace_probeid_end,
14848             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14849         dtrace_interrupt_enable(cookie);
14850
14851         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14852         dtrace_sync();
14853
14854 #if defined(sun)
14855         if (state->dts_getf != 0 &&
14856             !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14857                 /*
14858                  * We don't have kernel privs but we have at least one call
14859                  * to getf(); we need to lower our zone's count, and (if
14860                  * this is the last enabling to have an unprivileged call
14861                  * to getf()) we need to clear the closef() hook.
14862                  */
14863                 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14864                 ASSERT(dtrace_closef == dtrace_getf_barrier);
14865                 ASSERT(dtrace_getf > 0);
14866
14867                 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14868
14869                 if (--dtrace_getf == 0)
14870                         dtrace_closef = NULL;
14871         }
14872 #endif
14873
14874         return (0);
14875 }
14876
14877 static int
14878 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14879     dtrace_optval_t val)
14880 {
14881         ASSERT(MUTEX_HELD(&dtrace_lock));
14882
14883         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14884                 return (EBUSY);
14885
14886         if (option >= DTRACEOPT_MAX)
14887                 return (EINVAL);
14888
14889         if (option != DTRACEOPT_CPU && val < 0)
14890                 return (EINVAL);
14891
14892         switch (option) {
14893         case DTRACEOPT_DESTRUCTIVE:
14894                 if (dtrace_destructive_disallow)
14895                         return (EACCES);
14896
14897                 state->dts_cred.dcr_destructive = 1;
14898                 break;
14899
14900         case DTRACEOPT_BUFSIZE:
14901         case DTRACEOPT_DYNVARSIZE:
14902         case DTRACEOPT_AGGSIZE:
14903         case DTRACEOPT_SPECSIZE:
14904         case DTRACEOPT_STRSIZE:
14905                 if (val < 0)
14906                         return (EINVAL);
14907
14908                 if (val >= LONG_MAX) {
14909                         /*
14910                          * If this is an otherwise negative value, set it to
14911                          * the highest multiple of 128m less than LONG_MAX.
14912                          * Technically, we're adjusting the size without
14913                          * regard to the buffer resizing policy, but in fact,
14914                          * this has no effect -- if we set the buffer size to
14915                          * ~LONG_MAX and the buffer policy is ultimately set to
14916                          * be "manual", the buffer allocation is guaranteed to
14917                          * fail, if only because the allocation requires two
14918                          * buffers.  (We set the the size to the highest
14919                          * multiple of 128m because it ensures that the size
14920                          * will remain a multiple of a megabyte when
14921                          * repeatedly halved -- all the way down to 15m.)
14922                          */
14923                         val = LONG_MAX - (1 << 27) + 1;
14924                 }
14925         }
14926
14927         state->dts_options[option] = val;
14928
14929         return (0);
14930 }
14931
/*
 * Tear down 'state' and release the resources it holds.  The caller must
 * hold both dtrace_lock and cpu_lock.
 *
 * The order of operations matters: retained enablings are retracted, a
 * still-running state is marked KILLED and synced, ECBs are disabled and
 * destroyed, another sync is issued, and only then are the buffers, timers,
 * dynamic variable state and bookkeeping arrays released.
 */
static void
dtrace_state_destroy(dtrace_state_t *state)
{
        dtrace_ecb_t *ecb;
        dtrace_vstate_t *vstate = &state->dts_vstate;
#if defined(sun)
        minor_t minor = getminor(state->dts_dev);
#endif
        /* Size in bytes of one per-CPU array of dtrace_buffer_t. */
        int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
        dtrace_speculation_t *spec = state->dts_speculations;
        int nspec = state->dts_nspeculations;
        uint32_t match;

        ASSERT(MUTEX_HELD(&dtrace_lock));
        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * First, retract any retained enablings for this state.
         */
        dtrace_enabling_retract(state);
        ASSERT(state->dts_nretained == 0);

        if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
            state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
                /*
                 * We have managed to come into dtrace_state_destroy() on a
                 * hot enabling -- almost certainly because of a disorderly
                 * shutdown of a consumer.  (That is, a consumer that is
                 * exiting without having called dtrace_stop().) In this case,
                 * we're going to set our activity to be KILLED, and then
                 * issue a sync to be sure that everyone is out of probe
                 * context before we start blowing away ECBs.
                 */
                state->dts_activity = DTRACE_ACTIVITY_KILLED;
                dtrace_sync();
        }

        /*
         * Release the credential hold we took in dtrace_state_create().
         */
        if (state->dts_cred.dcr_cred != NULL)
                crfree(state->dts_cred.dcr_cred);

        /*
         * Now we can safely disable and destroy any enabled probes.  Because
         * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
         * (especially if they're all enabled), we take two passes through the
         * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
         * in the second we disable whatever is left over.
         */
        for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
                for (i = 0; i < state->dts_necbs; i++) {
                        if ((ecb = state->dts_ecbs[i]) == NULL)
                                continue;

                        /*
                         * First pass only: skip ECBs whose provider does not
                         * carry the flag being matched.
                         */
                        if (match && ecb->dte_probe != NULL) {
                                dtrace_probe_t *probe = ecb->dte_probe;
                                dtrace_provider_t *prov = probe->dtpr_provider;

                                if (!(prov->dtpv_priv.dtpp_flags & match))
                                        continue;
                        }

                        dtrace_ecb_disable(ecb);
                        dtrace_ecb_destroy(ecb);
                }

                /* The second pass (match == 0) is the last one. */
                if (!match)
                        break;
        }

        /*
         * Before we free the buffers, perform one more sync to assure that
         * every CPU is out of probe context.
         */
        dtrace_sync();

        dtrace_buffer_free(state->dts_buffer);
        dtrace_buffer_free(state->dts_aggbuffer);

        for (i = 0; i < nspec; i++)
                dtrace_buffer_free(spec[i].dtsp_buffer);

        /*
         * Remove the cleaner and deadman timers: cyclics on Solaris,
         * callouts otherwise.
         */
#if defined(sun)
        if (state->dts_cleaner != CYCLIC_NONE)
                cyclic_remove(state->dts_cleaner);

        if (state->dts_deadman != CYCLIC_NONE)
                cyclic_remove(state->dts_deadman);
#else
        callout_stop(&state->dts_cleaner);
        callout_drain(&state->dts_cleaner);
        callout_stop(&state->dts_deadman);
        callout_drain(&state->dts_deadman);
#endif

        dtrace_dstate_fini(&vstate->dtvs_dynvars);
        dtrace_vstate_fini(vstate);
        if (state->dts_ecbs != NULL)
                kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));

        if (state->dts_aggregations != NULL) {
#ifdef DEBUG
                /* Every aggregation slot is expected to be empty by now. */
                for (i = 0; i < state->dts_naggregations; i++)
                        ASSERT(state->dts_aggregations[i] == NULL);
#endif
                ASSERT(state->dts_naggregations > 0);
                kmem_free(state->dts_aggregations,
                    state->dts_naggregations * sizeof (dtrace_aggregation_t *));
        }

        /*
         * The buffer contents were released by dtrace_buffer_free() above;
         * now free the per-CPU dtrace_buffer_t arrays themselves.
         */
        kmem_free(state->dts_buffer, bufsize);
        kmem_free(state->dts_aggbuffer, bufsize);

        for (i = 0; i < nspec; i++)
                kmem_free(spec[i].dtsp_buffer, bufsize);

        if (spec != NULL)
                kmem_free(spec, nspec * sizeof (dtrace_speculation_t));

        dtrace_format_destroy(state);

        if (state->dts_aggid_arena != NULL) {
#if defined(sun)
                vmem_destroy(state->dts_aggid_arena);
#else
                delete_unrhdr(state->dts_aggid_arena);
#endif
                state->dts_aggid_arena = NULL;
        }
#if defined(sun)
        /* On Solaris, also release the soft state and the minor number. */
        ddi_soft_state_free(dtrace_softstate, minor);
        vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
}
15067
15068 /*
15069  * DTrace Anonymous Enabling Functions
15070  */
15071 static dtrace_state_t *
15072 dtrace_anon_grab(void)
15073 {
15074         dtrace_state_t *state;
15075
15076         ASSERT(MUTEX_HELD(&dtrace_lock));
15077
15078         if ((state = dtrace_anon.dta_state) == NULL) {
15079                 ASSERT(dtrace_anon.dta_enabling == NULL);
15080                 return (NULL);
15081         }
15082
15083         ASSERT(dtrace_anon.dta_enabling != NULL);
15084         ASSERT(dtrace_retained != NULL);
15085
15086         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15087         dtrace_anon.dta_enabling = NULL;
15088         dtrace_anon.dta_state = NULL;
15089
15090         return (state);
15091 }
15092
15093 static void
15094 dtrace_anon_property(void)
15095 {
15096         int i, rv;
15097         dtrace_state_t *state;
15098         dof_hdr_t *dof;
15099         char c[32];             /* enough for "dof-data-" + digits */
15100
15101         ASSERT(MUTEX_HELD(&dtrace_lock));
15102         ASSERT(MUTEX_HELD(&cpu_lock));
15103
15104         for (i = 0; ; i++) {
15105                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15106
15107                 dtrace_err_verbose = 1;
15108
15109                 if ((dof = dtrace_dof_property(c)) == NULL) {
15110                         dtrace_err_verbose = 0;
15111                         break;
15112                 }
15113
15114 #if defined(sun)
15115                 /*
15116                  * We want to create anonymous state, so we need to transition
15117                  * the kernel debugger to indicate that DTrace is active.  If
15118                  * this fails (e.g. because the debugger has modified text in
15119                  * some way), we won't continue with the processing.
15120                  */
15121                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15122                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15123                             "enabling ignored.");
15124                         dtrace_dof_destroy(dof);
15125                         break;
15126                 }
15127 #endif
15128
15129                 /*
15130                  * If we haven't allocated an anonymous state, we'll do so now.
15131                  */
15132                 if ((state = dtrace_anon.dta_state) == NULL) {
15133 #if defined(sun)
15134                         state = dtrace_state_create(NULL, NULL);
15135 #else
15136                         state = dtrace_state_create(NULL);
15137 #endif
15138                         dtrace_anon.dta_state = state;
15139
15140                         if (state == NULL) {
15141                                 /*
15142                                  * This basically shouldn't happen:  the only
15143                                  * failure mode from dtrace_state_create() is a
15144                                  * failure of ddi_soft_state_zalloc() that
15145                                  * itself should never happen.  Still, the
15146                                  * interface allows for a failure mode, and
15147                                  * we want to fail as gracefully as possible:
15148                                  * we'll emit an error message and cease
15149                                  * processing anonymous state in this case.
15150                                  */
15151                                 cmn_err(CE_WARN, "failed to create "
15152                                     "anonymous state");
15153                                 dtrace_dof_destroy(dof);
15154                                 break;
15155                         }
15156                 }
15157
15158                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15159                     &dtrace_anon.dta_enabling, 0, B_TRUE);
15160
15161                 if (rv == 0)
15162                         rv = dtrace_dof_options(dof, state);
15163
15164                 dtrace_err_verbose = 0;
15165                 dtrace_dof_destroy(dof);
15166
15167                 if (rv != 0) {
15168                         /*
15169                          * This is malformed DOF; chuck any anonymous state
15170                          * that we created.
15171                          */
15172                         ASSERT(dtrace_anon.dta_enabling == NULL);
15173                         dtrace_state_destroy(state);
15174                         dtrace_anon.dta_state = NULL;
15175                         break;
15176                 }
15177
15178                 ASSERT(dtrace_anon.dta_enabling != NULL);
15179         }
15180
15181         if (dtrace_anon.dta_enabling != NULL) {
15182                 int rval;
15183
15184                 /*
15185                  * dtrace_enabling_retain() can only fail because we are
15186                  * trying to retain more enablings than are allowed -- but
15187                  * we only have one anonymous enabling, and we are guaranteed
15188                  * to be allowed at least one retained enabling; we assert
15189                  * that dtrace_enabling_retain() returns success.
15190                  */
15191                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15192                 ASSERT(rval == 0);
15193
15194                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15195         }
15196 }
15197
15198 /*
15199  * DTrace Helper Functions
15200  */
15201 static void
15202 dtrace_helper_trace(dtrace_helper_action_t *helper,
15203     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15204 {
15205         uint32_t size, next, nnext, i;
15206         dtrace_helptrace_t *ent;
15207         uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15208
15209         if (!dtrace_helptrace_enabled)
15210                 return;
15211
15212         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15213
15214         /*
15215          * What would a tracing framework be without its own tracing
15216          * framework?  (Well, a hell of a lot simpler, for starters...)
15217          */
15218         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15219             sizeof (uint64_t) - sizeof (uint64_t);
15220
15221         /*
15222          * Iterate until we can allocate a slot in the trace buffer.
15223          */
15224         do {
15225                 next = dtrace_helptrace_next;
15226
15227                 if (next + size < dtrace_helptrace_bufsize) {
15228                         nnext = next + size;
15229                 } else {
15230                         nnext = size;
15231                 }
15232         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15233
15234         /*
15235          * We have our slot; fill it in.
15236          */
15237         if (nnext == size)
15238                 next = 0;
15239
15240         ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15241         ent->dtht_helper = helper;
15242         ent->dtht_where = where;
15243         ent->dtht_nlocals = vstate->dtvs_nlocals;
15244
15245         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15246             mstate->dtms_fltoffs : -1;
15247         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15248         ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15249
15250         for (i = 0; i < vstate->dtvs_nlocals; i++) {
15251                 dtrace_statvar_t *svar;
15252
15253                 if ((svar = vstate->dtvs_locals[i]) == NULL)
15254                         continue;
15255
15256                 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15257                 ent->dtht_locals[i] =
15258                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15259         }
15260 }
15261
15262 static uint64_t
15263 dtrace_helper(int which, dtrace_mstate_t *mstate,
15264     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15265 {
15266         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15267         uint64_t sarg0 = mstate->dtms_arg[0];
15268         uint64_t sarg1 = mstate->dtms_arg[1];
15269         uint64_t rval = 0;
15270         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15271         dtrace_helper_action_t *helper;
15272         dtrace_vstate_t *vstate;
15273         dtrace_difo_t *pred;
15274         int i, trace = dtrace_helptrace_enabled;
15275
15276         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15277
15278         if (helpers == NULL)
15279                 return (0);
15280
15281         if ((helper = helpers->dthps_actions[which]) == NULL)
15282                 return (0);
15283
15284         vstate = &helpers->dthps_vstate;
15285         mstate->dtms_arg[0] = arg0;
15286         mstate->dtms_arg[1] = arg1;
15287
15288         /*
15289          * Now iterate over each helper.  If its predicate evaluates to 'true',
15290          * we'll call the corresponding actions.  Note that the below calls
15291          * to dtrace_dif_emulate() may set faults in machine state.  This is
15292          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15293          * the stored DIF offset with its own (which is the desired behavior).
15294          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15295          * from machine state; this is okay, too.
15296          */
15297         for (; helper != NULL; helper = helper->dtha_next) {
15298                 if ((pred = helper->dtha_predicate) != NULL) {
15299                         if (trace)
15300                                 dtrace_helper_trace(helper, mstate, vstate, 0);
15301
15302                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15303                                 goto next;
15304
15305                         if (*flags & CPU_DTRACE_FAULT)
15306                                 goto err;
15307                 }
15308
15309                 for (i = 0; i < helper->dtha_nactions; i++) {
15310                         if (trace)
15311                                 dtrace_helper_trace(helper,
15312                                     mstate, vstate, i + 1);
15313
15314                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
15315                             mstate, vstate, state);
15316
15317                         if (*flags & CPU_DTRACE_FAULT)
15318                                 goto err;
15319                 }
15320
15321 next:
15322                 if (trace)
15323                         dtrace_helper_trace(helper, mstate, vstate,
15324                             DTRACE_HELPTRACE_NEXT);
15325         }
15326
15327         if (trace)
15328                 dtrace_helper_trace(helper, mstate, vstate,
15329                     DTRACE_HELPTRACE_DONE);
15330
15331         /*
15332          * Restore the arg0 that we saved upon entry.
15333          */
15334         mstate->dtms_arg[0] = sarg0;
15335         mstate->dtms_arg[1] = sarg1;
15336
15337         return (rval);
15338
15339 err:
15340         if (trace)
15341                 dtrace_helper_trace(helper, mstate, vstate,
15342                     DTRACE_HELPTRACE_ERR);
15343
15344         /*
15345          * Restore the arg0 that we saved upon entry.
15346          */
15347         mstate->dtms_arg[0] = sarg0;
15348         mstate->dtms_arg[1] = sarg1;
15349
15350         return (0);
15351 }
15352
15353 static void
15354 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15355     dtrace_vstate_t *vstate)
15356 {
15357         int i;
15358
15359         if (helper->dtha_predicate != NULL)
15360                 dtrace_difo_release(helper->dtha_predicate, vstate);
15361
15362         for (i = 0; i < helper->dtha_nactions; i++) {
15363                 ASSERT(helper->dtha_actions[i] != NULL);
15364                 dtrace_difo_release(helper->dtha_actions[i], vstate);
15365         }
15366
15367         kmem_free(helper->dtha_actions,
15368             helper->dtha_nactions * sizeof (dtrace_difo_t *));
15369         kmem_free(helper, sizeof (dtrace_helper_action_t));
15370 }
15371
15372 static int
15373 dtrace_helper_destroygen(int gen)
15374 {
15375         proc_t *p = curproc;
15376         dtrace_helpers_t *help = p->p_dtrace_helpers;
15377         dtrace_vstate_t *vstate;
15378         int i;
15379
15380         ASSERT(MUTEX_HELD(&dtrace_lock));
15381
15382         if (help == NULL || gen > help->dthps_generation)
15383                 return (EINVAL);
15384
15385         vstate = &help->dthps_vstate;
15386
15387         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15388                 dtrace_helper_action_t *last = NULL, *h, *next;
15389
15390                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15391                         next = h->dtha_next;
15392
15393                         if (h->dtha_generation == gen) {
15394                                 if (last != NULL) {
15395                                         last->dtha_next = next;
15396                                 } else {
15397                                         help->dthps_actions[i] = next;
15398                                 }
15399
15400                                 dtrace_helper_action_destroy(h, vstate);
15401                         } else {
15402                                 last = h;
15403                         }
15404                 }
15405         }
15406
15407         /*
15408          * Interate until we've cleared out all helper providers with the
15409          * given generation number.
15410          */
15411         for (;;) {
15412                 dtrace_helper_provider_t *prov;
15413
15414                 /*
15415                  * Look for a helper provider with the right generation. We
15416                  * have to start back at the beginning of the list each time
15417                  * because we drop dtrace_lock. It's unlikely that we'll make
15418                  * more than two passes.
15419                  */
15420                 for (i = 0; i < help->dthps_nprovs; i++) {
15421                         prov = help->dthps_provs[i];
15422
15423                         if (prov->dthp_generation == gen)
15424                                 break;
15425                 }
15426
15427                 /*
15428                  * If there were no matches, we're done.
15429                  */
15430                 if (i == help->dthps_nprovs)
15431                         break;
15432
15433                 /*
15434                  * Move the last helper provider into this slot.
15435                  */
15436                 help->dthps_nprovs--;
15437                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15438                 help->dthps_provs[help->dthps_nprovs] = NULL;
15439
15440                 mutex_exit(&dtrace_lock);
15441
15442                 /*
15443                  * If we have a meta provider, remove this helper provider.
15444                  */
15445                 mutex_enter(&dtrace_meta_lock);
15446                 if (dtrace_meta_pid != NULL) {
15447                         ASSERT(dtrace_deferred_pid == NULL);
15448                         dtrace_helper_provider_remove(&prov->dthp_prov,
15449                             p->p_pid);
15450                 }
15451                 mutex_exit(&dtrace_meta_lock);
15452
15453                 dtrace_helper_provider_destroy(prov);
15454
15455                 mutex_enter(&dtrace_lock);
15456         }
15457
15458         return (0);
15459 }
15460
15461 static int
15462 dtrace_helper_validate(dtrace_helper_action_t *helper)
15463 {
15464         int err = 0, i;
15465         dtrace_difo_t *dp;
15466
15467         if ((dp = helper->dtha_predicate) != NULL)
15468                 err += dtrace_difo_validate_helper(dp);
15469
15470         for (i = 0; i < helper->dtha_nactions; i++)
15471                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15472
15473         return (err == 0);
15474 }
15475
15476 static int
15477 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15478 {
15479         dtrace_helpers_t *help;
15480         dtrace_helper_action_t *helper, *last;
15481         dtrace_actdesc_t *act;
15482         dtrace_vstate_t *vstate;
15483         dtrace_predicate_t *pred;
15484         int count = 0, nactions = 0, i;
15485
15486         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15487                 return (EINVAL);
15488
15489         help = curproc->p_dtrace_helpers;
15490         last = help->dthps_actions[which];
15491         vstate = &help->dthps_vstate;
15492
15493         for (count = 0; last != NULL; last = last->dtha_next) {
15494                 count++;
15495                 if (last->dtha_next == NULL)
15496                         break;
15497         }
15498
15499         /*
15500          * If we already have dtrace_helper_actions_max helper actions for this
15501          * helper action type, we'll refuse to add a new one.
15502          */
15503         if (count >= dtrace_helper_actions_max)
15504                 return (ENOSPC);
15505
15506         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15507         helper->dtha_generation = help->dthps_generation;
15508
15509         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15510                 ASSERT(pred->dtp_difo != NULL);
15511                 dtrace_difo_hold(pred->dtp_difo);
15512                 helper->dtha_predicate = pred->dtp_difo;
15513         }
15514
15515         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15516                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15517                         goto err;
15518
15519                 if (act->dtad_difo == NULL)
15520                         goto err;
15521
15522                 nactions++;
15523         }
15524
15525         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15526             (helper->dtha_nactions = nactions), KM_SLEEP);
15527
15528         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15529                 dtrace_difo_hold(act->dtad_difo);
15530                 helper->dtha_actions[i++] = act->dtad_difo;
15531         }
15532
15533         if (!dtrace_helper_validate(helper))
15534                 goto err;
15535
15536         if (last == NULL) {
15537                 help->dthps_actions[which] = helper;
15538         } else {
15539                 last->dtha_next = helper;
15540         }
15541
15542         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15543                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15544                 dtrace_helptrace_next = 0;
15545         }
15546
15547         return (0);
15548 err:
15549         dtrace_helper_action_destroy(helper, vstate);
15550         return (EINVAL);
15551 }
15552
15553 static void
15554 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15555     dof_helper_t *dofhp)
15556 {
15557         ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15558
15559         mutex_enter(&dtrace_meta_lock);
15560         mutex_enter(&dtrace_lock);
15561
15562         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15563                 /*
15564                  * If the dtrace module is loaded but not attached, or if
15565                  * there aren't isn't a meta provider registered to deal with
15566                  * these provider descriptions, we need to postpone creating
15567                  * the actual providers until later.
15568                  */
15569
15570                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15571                     dtrace_deferred_pid != help) {
15572                         help->dthps_deferred = 1;
15573                         help->dthps_pid = p->p_pid;
15574                         help->dthps_next = dtrace_deferred_pid;
15575                         help->dthps_prev = NULL;
15576                         if (dtrace_deferred_pid != NULL)
15577                                 dtrace_deferred_pid->dthps_prev = help;
15578                         dtrace_deferred_pid = help;
15579                 }
15580
15581                 mutex_exit(&dtrace_lock);
15582
15583         } else if (dofhp != NULL) {
15584                 /*
15585                  * If the dtrace module is loaded and we have a particular
15586                  * helper provider description, pass that off to the
15587                  * meta provider.
15588                  */
15589
15590                 mutex_exit(&dtrace_lock);
15591
15592                 dtrace_helper_provide(dofhp, p->p_pid);
15593
15594         } else {
15595                 /*
15596                  * Otherwise, just pass all the helper provider descriptions
15597                  * off to the meta provider.
15598                  */
15599
15600                 int i;
15601                 mutex_exit(&dtrace_lock);
15602
15603                 for (i = 0; i < help->dthps_nprovs; i++) {
15604                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15605                             p->p_pid);
15606                 }
15607         }
15608
15609         mutex_exit(&dtrace_meta_lock);
15610 }
15611
15612 static int
15613 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15614 {
15615         dtrace_helpers_t *help;
15616         dtrace_helper_provider_t *hprov, **tmp_provs;
15617         uint_t tmp_maxprovs, i;
15618
15619         ASSERT(MUTEX_HELD(&dtrace_lock));
15620
15621         help = curproc->p_dtrace_helpers;
15622         ASSERT(help != NULL);
15623
15624         /*
15625          * If we already have dtrace_helper_providers_max helper providers,
15626          * we're refuse to add a new one.
15627          */
15628         if (help->dthps_nprovs >= dtrace_helper_providers_max)
15629                 return (ENOSPC);
15630
15631         /*
15632          * Check to make sure this isn't a duplicate.
15633          */
15634         for (i = 0; i < help->dthps_nprovs; i++) {
15635                 if (dofhp->dofhp_dof ==
15636                     help->dthps_provs[i]->dthp_prov.dofhp_dof)
15637                         return (EALREADY);
15638         }
15639
15640         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15641         hprov->dthp_prov = *dofhp;
15642         hprov->dthp_ref = 1;
15643         hprov->dthp_generation = gen;
15644
15645         /*
15646          * Allocate a bigger table for helper providers if it's already full.
15647          */
15648         if (help->dthps_maxprovs == help->dthps_nprovs) {
15649                 tmp_maxprovs = help->dthps_maxprovs;
15650                 tmp_provs = help->dthps_provs;
15651
15652                 if (help->dthps_maxprovs == 0)
15653                         help->dthps_maxprovs = 2;
15654                 else
15655                         help->dthps_maxprovs *= 2;
15656                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15657                         help->dthps_maxprovs = dtrace_helper_providers_max;
15658
15659                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15660
15661                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15662                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15663
15664                 if (tmp_provs != NULL) {
15665                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15666                             sizeof (dtrace_helper_provider_t *));
15667                         kmem_free(tmp_provs, tmp_maxprovs *
15668                             sizeof (dtrace_helper_provider_t *));
15669                 }
15670         }
15671
15672         help->dthps_provs[help->dthps_nprovs] = hprov;
15673         help->dthps_nprovs++;
15674
15675         return (0);
15676 }
15677
15678 static void
15679 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15680 {
15681         mutex_enter(&dtrace_lock);
15682
15683         if (--hprov->dthp_ref == 0) {
15684                 dof_hdr_t *dof;
15685                 mutex_exit(&dtrace_lock);
15686                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15687                 dtrace_dof_destroy(dof);
15688                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15689         } else {
15690                 mutex_exit(&dtrace_lock);
15691         }
15692 }
15693
15694 static int
15695 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15696 {
15697         uintptr_t daddr = (uintptr_t)dof;
15698         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15699         dof_provider_t *provider;
15700         dof_probe_t *probe;
15701         uint8_t *arg;
15702         char *strtab, *typestr;
15703         dof_stridx_t typeidx;
15704         size_t typesz;
15705         uint_t nprobes, j, k;
15706
15707         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15708
15709         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15710                 dtrace_dof_error(dof, "misaligned section offset");
15711                 return (-1);
15712         }
15713
15714         /*
15715          * The section needs to be large enough to contain the DOF provider
15716          * structure appropriate for the given version.
15717          */
15718         if (sec->dofs_size <
15719             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15720             offsetof(dof_provider_t, dofpv_prenoffs) :
15721             sizeof (dof_provider_t))) {
15722                 dtrace_dof_error(dof, "provider section too small");
15723                 return (-1);
15724         }
15725
15726         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15727         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15728         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15729         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15730         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15731
15732         if (str_sec == NULL || prb_sec == NULL ||
15733             arg_sec == NULL || off_sec == NULL)
15734                 return (-1);
15735
15736         enoff_sec = NULL;
15737
15738         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15739             provider->dofpv_prenoffs != DOF_SECT_NONE &&
15740             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15741             provider->dofpv_prenoffs)) == NULL)
15742                 return (-1);
15743
15744         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15745
15746         if (provider->dofpv_name >= str_sec->dofs_size ||
15747             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15748                 dtrace_dof_error(dof, "invalid provider name");
15749                 return (-1);
15750         }
15751
15752         if (prb_sec->dofs_entsize == 0 ||
15753             prb_sec->dofs_entsize > prb_sec->dofs_size) {
15754                 dtrace_dof_error(dof, "invalid entry size");
15755                 return (-1);
15756         }
15757
15758         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15759                 dtrace_dof_error(dof, "misaligned entry size");
15760                 return (-1);
15761         }
15762
15763         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15764                 dtrace_dof_error(dof, "invalid entry size");
15765                 return (-1);
15766         }
15767
15768         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15769                 dtrace_dof_error(dof, "misaligned section offset");
15770                 return (-1);
15771         }
15772
15773         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15774                 dtrace_dof_error(dof, "invalid entry size");
15775                 return (-1);
15776         }
15777
15778         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15779
15780         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15781
15782         /*
15783          * Take a pass through the probes to check for errors.
15784          */
15785         for (j = 0; j < nprobes; j++) {
15786                 probe = (dof_probe_t *)(uintptr_t)(daddr +
15787                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15788
15789                 if (probe->dofpr_func >= str_sec->dofs_size) {
15790                         dtrace_dof_error(dof, "invalid function name");
15791                         return (-1);
15792                 }
15793
15794                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15795                         dtrace_dof_error(dof, "function name too long");
15796                         return (-1);
15797                 }
15798
15799                 if (probe->dofpr_name >= str_sec->dofs_size ||
15800                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15801                         dtrace_dof_error(dof, "invalid probe name");
15802                         return (-1);
15803                 }
15804
15805                 /*
15806                  * The offset count must not wrap the index, and the offsets
15807                  * must also not overflow the section's data.
15808                  */
15809                 if (probe->dofpr_offidx + probe->dofpr_noffs <
15810                     probe->dofpr_offidx ||
15811                     (probe->dofpr_offidx + probe->dofpr_noffs) *
15812                     off_sec->dofs_entsize > off_sec->dofs_size) {
15813                         dtrace_dof_error(dof, "invalid probe offset");
15814                         return (-1);
15815                 }
15816
15817                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15818                         /*
15819                          * If there's no is-enabled offset section, make sure
15820                          * there aren't any is-enabled offsets. Otherwise
15821                          * perform the same checks as for probe offsets
15822                          * (immediately above).
15823                          */
15824                         if (enoff_sec == NULL) {
15825                                 if (probe->dofpr_enoffidx != 0 ||
15826                                     probe->dofpr_nenoffs != 0) {
15827                                         dtrace_dof_error(dof, "is-enabled "
15828                                             "offsets with null section");
15829                                         return (-1);
15830                                 }
15831                         } else if (probe->dofpr_enoffidx +
15832                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15833                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15834                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15835                                 dtrace_dof_error(dof, "invalid is-enabled "
15836                                     "offset");
15837                                 return (-1);
15838                         }
15839
15840                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15841                                 dtrace_dof_error(dof, "zero probe and "
15842                                     "is-enabled offsets");
15843                                 return (-1);
15844                         }
15845                 } else if (probe->dofpr_noffs == 0) {
15846                         dtrace_dof_error(dof, "zero probe offsets");
15847                         return (-1);
15848                 }
15849
15850                 if (probe->dofpr_argidx + probe->dofpr_xargc <
15851                     probe->dofpr_argidx ||
15852                     (probe->dofpr_argidx + probe->dofpr_xargc) *
15853                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
15854                         dtrace_dof_error(dof, "invalid args");
15855                         return (-1);
15856                 }
15857
15858                 typeidx = probe->dofpr_nargv;
15859                 typestr = strtab + probe->dofpr_nargv;
15860                 for (k = 0; k < probe->dofpr_nargc; k++) {
15861                         if (typeidx >= str_sec->dofs_size) {
15862                                 dtrace_dof_error(dof, "bad "
15863                                     "native argument type");
15864                                 return (-1);
15865                         }
15866
15867                         typesz = strlen(typestr) + 1;
15868                         if (typesz > DTRACE_ARGTYPELEN) {
15869                                 dtrace_dof_error(dof, "native "
15870                                     "argument type too long");
15871                                 return (-1);
15872                         }
15873                         typeidx += typesz;
15874                         typestr += typesz;
15875                 }
15876
15877                 typeidx = probe->dofpr_xargv;
15878                 typestr = strtab + probe->dofpr_xargv;
15879                 for (k = 0; k < probe->dofpr_xargc; k++) {
15880                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15881                                 dtrace_dof_error(dof, "bad "
15882                                     "native argument index");
15883                                 return (-1);
15884                         }
15885
15886                         if (typeidx >= str_sec->dofs_size) {
15887                                 dtrace_dof_error(dof, "bad "
15888                                     "translated argument type");
15889                                 return (-1);
15890                         }
15891
15892                         typesz = strlen(typestr) + 1;
15893                         if (typesz > DTRACE_ARGTYPELEN) {
15894                                 dtrace_dof_error(dof, "translated argument "
15895                                     "type too long");
15896                                 return (-1);
15897                         }
15898
15899                         typeidx += typesz;
15900                         typestr += typesz;
15901                 }
15902         }
15903
15904         return (0);
15905 }
15906
15907 static int
15908 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15909 {
15910         dtrace_helpers_t *help;
15911         dtrace_vstate_t *vstate;
15912         dtrace_enabling_t *enab = NULL;
15913         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15914         uintptr_t daddr = (uintptr_t)dof;
15915
15916         ASSERT(MUTEX_HELD(&dtrace_lock));
15917
15918         if ((help = curproc->p_dtrace_helpers) == NULL)
15919                 help = dtrace_helpers_create(curproc);
15920
15921         vstate = &help->dthps_vstate;
15922
15923         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15924             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15925                 dtrace_dof_destroy(dof);
15926                 return (rv);
15927         }
15928
15929         /*
15930          * Look for helper providers and validate their descriptions.
15931          */
15932         if (dhp != NULL) {
15933                 for (i = 0; i < dof->dofh_secnum; i++) {
15934                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15935                             dof->dofh_secoff + i * dof->dofh_secsize);
15936
15937                         if (sec->dofs_type != DOF_SECT_PROVIDER)
15938                                 continue;
15939
15940                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
15941                                 dtrace_enabling_destroy(enab);
15942                                 dtrace_dof_destroy(dof);
15943                                 return (-1);
15944                         }
15945
15946                         nprovs++;
15947                 }
15948         }
15949
15950         /*
15951          * Now we need to walk through the ECB descriptions in the enabling.
15952          */
15953         for (i = 0; i < enab->dten_ndesc; i++) {
15954                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15955                 dtrace_probedesc_t *desc = &ep->dted_probe;
15956
15957                 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15958                         continue;
15959
15960                 if (strcmp(desc->dtpd_mod, "helper") != 0)
15961                         continue;
15962
15963                 if (strcmp(desc->dtpd_func, "ustack") != 0)
15964                         continue;
15965
15966                 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15967                     ep)) != 0) {
15968                         /*
15969                          * Adding this helper action failed -- we are now going
15970                          * to rip out the entire generation and return failure.
15971                          */
15972                         (void) dtrace_helper_destroygen(help->dthps_generation);
15973                         dtrace_enabling_destroy(enab);
15974                         dtrace_dof_destroy(dof);
15975                         return (-1);
15976                 }
15977
15978                 nhelpers++;
15979         }
15980
15981         if (nhelpers < enab->dten_ndesc)
15982                 dtrace_dof_error(dof, "unmatched helpers");
15983
15984         gen = help->dthps_generation++;
15985         dtrace_enabling_destroy(enab);
15986
15987         if (dhp != NULL && nprovs > 0) {
15988                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15989                 if (dtrace_helper_provider_add(dhp, gen) == 0) {
15990                         mutex_exit(&dtrace_lock);
15991                         dtrace_helper_provider_register(curproc, help, dhp);
15992                         mutex_enter(&dtrace_lock);
15993
15994                         destroy = 0;
15995                 }
15996         }
15997
15998         if (destroy)
15999                 dtrace_dof_destroy(dof);
16000
16001         return (gen);
16002 }
16003
16004 static dtrace_helpers_t *
16005 dtrace_helpers_create(proc_t *p)
16006 {
16007         dtrace_helpers_t *help;
16008
16009         ASSERT(MUTEX_HELD(&dtrace_lock));
16010         ASSERT(p->p_dtrace_helpers == NULL);
16011
16012         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16013         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16014             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16015
16016         p->p_dtrace_helpers = help;
16017         dtrace_helpers++;
16018
16019         return (help);
16020 }
16021
16022 #if defined(sun)
16023 static
16024 #endif
16025 void
16026 dtrace_helpers_destroy(proc_t *p)
16027 {
16028         dtrace_helpers_t *help;
16029         dtrace_vstate_t *vstate;
16030 #if defined(sun)
16031         proc_t *p = curproc;
16032 #endif
16033         int i;
16034
16035         mutex_enter(&dtrace_lock);
16036
16037         ASSERT(p->p_dtrace_helpers != NULL);
16038         ASSERT(dtrace_helpers > 0);
16039
16040         help = p->p_dtrace_helpers;
16041         vstate = &help->dthps_vstate;
16042
16043         /*
16044          * We're now going to lose the help from this process.
16045          */
16046         p->p_dtrace_helpers = NULL;
16047         dtrace_sync();
16048
16049         /*
16050          * Destory the helper actions.
16051          */
16052         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16053                 dtrace_helper_action_t *h, *next;
16054
16055                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16056                         next = h->dtha_next;
16057                         dtrace_helper_action_destroy(h, vstate);
16058                         h = next;
16059                 }
16060         }
16061
16062         mutex_exit(&dtrace_lock);
16063
16064         /*
16065          * Destroy the helper providers.
16066          */
16067         if (help->dthps_maxprovs > 0) {
16068                 mutex_enter(&dtrace_meta_lock);
16069                 if (dtrace_meta_pid != NULL) {
16070                         ASSERT(dtrace_deferred_pid == NULL);
16071
16072                         for (i = 0; i < help->dthps_nprovs; i++) {
16073                                 dtrace_helper_provider_remove(
16074                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
16075                         }
16076                 } else {
16077                         mutex_enter(&dtrace_lock);
16078                         ASSERT(help->dthps_deferred == 0 ||
16079                             help->dthps_next != NULL ||
16080                             help->dthps_prev != NULL ||
16081                             help == dtrace_deferred_pid);
16082
16083                         /*
16084                          * Remove the helper from the deferred list.
16085                          */
16086                         if (help->dthps_next != NULL)
16087                                 help->dthps_next->dthps_prev = help->dthps_prev;
16088                         if (help->dthps_prev != NULL)
16089                                 help->dthps_prev->dthps_next = help->dthps_next;
16090                         if (dtrace_deferred_pid == help) {
16091                                 dtrace_deferred_pid = help->dthps_next;
16092                                 ASSERT(help->dthps_prev == NULL);
16093                         }
16094
16095                         mutex_exit(&dtrace_lock);
16096                 }
16097
16098                 mutex_exit(&dtrace_meta_lock);
16099
16100                 for (i = 0; i < help->dthps_nprovs; i++) {
16101                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
16102                 }
16103
16104                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16105                     sizeof (dtrace_helper_provider_t *));
16106         }
16107
16108         mutex_enter(&dtrace_lock);
16109
16110         dtrace_vstate_fini(&help->dthps_vstate);
16111         kmem_free(help->dthps_actions,
16112             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16113         kmem_free(help, sizeof (dtrace_helpers_t));
16114
16115         --dtrace_helpers;
16116         mutex_exit(&dtrace_lock);
16117 }
16118
16119 #if defined(sun)
16120 static
16121 #endif
16122 void
16123 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16124 {
16125         dtrace_helpers_t *help, *newhelp;
16126         dtrace_helper_action_t *helper, *new, *last;
16127         dtrace_difo_t *dp;
16128         dtrace_vstate_t *vstate;
16129         int i, j, sz, hasprovs = 0;
16130
16131         mutex_enter(&dtrace_lock);
16132         ASSERT(from->p_dtrace_helpers != NULL);
16133         ASSERT(dtrace_helpers > 0);
16134
16135         help = from->p_dtrace_helpers;
16136         newhelp = dtrace_helpers_create(to);
16137         ASSERT(to->p_dtrace_helpers != NULL);
16138
16139         newhelp->dthps_generation = help->dthps_generation;
16140         vstate = &newhelp->dthps_vstate;
16141
16142         /*
16143          * Duplicate the helper actions.
16144          */
16145         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16146                 if ((helper = help->dthps_actions[i]) == NULL)
16147                         continue;
16148
16149                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16150                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16151                             KM_SLEEP);
16152                         new->dtha_generation = helper->dtha_generation;
16153
16154                         if ((dp = helper->dtha_predicate) != NULL) {
16155                                 dp = dtrace_difo_duplicate(dp, vstate);
16156                                 new->dtha_predicate = dp;
16157                         }
16158
16159                         new->dtha_nactions = helper->dtha_nactions;
16160                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16161                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16162
16163                         for (j = 0; j < new->dtha_nactions; j++) {
16164                                 dtrace_difo_t *dp = helper->dtha_actions[j];
16165
16166                                 ASSERT(dp != NULL);
16167                                 dp = dtrace_difo_duplicate(dp, vstate);
16168                                 new->dtha_actions[j] = dp;
16169                         }
16170
16171                         if (last != NULL) {
16172                                 last->dtha_next = new;
16173                         } else {
16174                                 newhelp->dthps_actions[i] = new;
16175                         }
16176
16177                         last = new;
16178                 }
16179         }
16180
16181         /*
16182          * Duplicate the helper providers and register them with the
16183          * DTrace framework.
16184          */
16185         if (help->dthps_nprovs > 0) {
16186                 newhelp->dthps_nprovs = help->dthps_nprovs;
16187                 newhelp->dthps_maxprovs = help->dthps_nprovs;
16188                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16189                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16190                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16191                         newhelp->dthps_provs[i] = help->dthps_provs[i];
16192                         newhelp->dthps_provs[i]->dthp_ref++;
16193                 }
16194
16195                 hasprovs = 1;
16196         }
16197
16198         mutex_exit(&dtrace_lock);
16199
16200         if (hasprovs)
16201                 dtrace_helper_provider_register(to, newhelp, NULL);
16202 }
16203
16204 /*
16205  * DTrace Hook Functions
16206  */
16207 static void
16208 dtrace_module_loaded(modctl_t *ctl)
16209 {
16210         dtrace_provider_t *prv;
16211
16212         mutex_enter(&dtrace_provider_lock);
16213 #if defined(sun)
16214         mutex_enter(&mod_lock);
16215 #endif
16216
16217 #if defined(sun)
16218         ASSERT(ctl->mod_busy);
16219 #endif
16220
16221         /*
16222          * We're going to call each providers per-module provide operation
16223          * specifying only this module.
16224          */
16225         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16226                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16227
16228 #if defined(sun)
16229         mutex_exit(&mod_lock);
16230 #endif
16231         mutex_exit(&dtrace_provider_lock);
16232
16233         /*
16234          * If we have any retained enablings, we need to match against them.
16235          * Enabling probes requires that cpu_lock be held, and we cannot hold
16236          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16237          * module.  (In particular, this happens when loading scheduling
16238          * classes.)  So if we have any retained enablings, we need to dispatch
16239          * our task queue to do the match for us.
16240          */
16241         mutex_enter(&dtrace_lock);
16242
16243         if (dtrace_retained == NULL) {
16244                 mutex_exit(&dtrace_lock);
16245                 return;
16246         }
16247
16248         (void) taskq_dispatch(dtrace_taskq,
16249             (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16250
16251         mutex_exit(&dtrace_lock);
16252
16253         /*
16254          * And now, for a little heuristic sleaze:  in general, we want to
16255          * match modules as soon as they load.  However, we cannot guarantee
16256          * this, because it would lead us to the lock ordering violation
16257          * outlined above.  The common case, of course, is that cpu_lock is
16258          * _not_ held -- so we delay here for a clock tick, hoping that that's
16259          * long enough for the task queue to do its work.  If it's not, it's
16260          * not a serious problem -- it just means that the module that we
16261          * just loaded may not be immediately instrumentable.
16262          */
16263         delay(1);
16264 }
16265
16266 static void
16267 #if defined(sun)
16268 dtrace_module_unloaded(modctl_t *ctl)
16269 #else
16270 dtrace_module_unloaded(modctl_t *ctl, int *error)
16271 #endif
16272 {
16273         dtrace_probe_t template, *probe, *first, *next;
16274         dtrace_provider_t *prov;
16275 #if !defined(sun)
16276         char modname[DTRACE_MODNAMELEN];
16277         size_t len;
16278 #endif
16279
16280 #if defined(sun)
16281         template.dtpr_mod = ctl->mod_modname;
16282 #else
16283         /* Handle the fact that ctl->filename may end in ".ko". */
16284         strlcpy(modname, ctl->filename, sizeof(modname));
16285         len = strlen(ctl->filename);
16286         if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16287                 modname[len - 3] = '\0';
16288         template.dtpr_mod = modname;
16289 #endif
16290
16291         mutex_enter(&dtrace_provider_lock);
16292 #if defined(sun)
16293         mutex_enter(&mod_lock);
16294 #endif
16295         mutex_enter(&dtrace_lock);
16296
16297 #if !defined(sun)
16298         if (ctl->nenabled > 0) {
16299                 /* Don't allow unloads if a probe is enabled. */
16300                 mutex_exit(&dtrace_provider_lock);
16301                 mutex_exit(&dtrace_lock);
16302                 *error = -1;
16303                 printf(
16304         "kldunload: attempt to unload module that has DTrace probes enabled\n");
16305                 return;
16306         }
16307 #endif
16308
16309         if (dtrace_bymod == NULL) {
16310                 /*
16311                  * The DTrace module is loaded (obviously) but not attached;
16312                  * we don't have any work to do.
16313                  */
16314                 mutex_exit(&dtrace_provider_lock);
16315 #if defined(sun)
16316                 mutex_exit(&mod_lock);
16317 #endif
16318                 mutex_exit(&dtrace_lock);
16319                 return;
16320         }
16321
16322         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16323             probe != NULL; probe = probe->dtpr_nextmod) {
16324                 if (probe->dtpr_ecb != NULL) {
16325                         mutex_exit(&dtrace_provider_lock);
16326 #if defined(sun)
16327                         mutex_exit(&mod_lock);
16328 #endif
16329                         mutex_exit(&dtrace_lock);
16330
16331                         /*
16332                          * This shouldn't _actually_ be possible -- we're
16333                          * unloading a module that has an enabled probe in it.
16334                          * (It's normally up to the provider to make sure that
16335                          * this can't happen.)  However, because dtps_enable()
16336                          * doesn't have a failure mode, there can be an
16337                          * enable/unload race.  Upshot:  we don't want to
16338                          * assert, but we're not going to disable the
16339                          * probe, either.
16340                          */
16341                         if (dtrace_err_verbose) {
16342 #if defined(sun)
16343                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16344                                     "enabled probes", ctl->mod_modname);
16345 #else
16346                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16347                                     "enabled probes", modname);
16348 #endif
16349                         }
16350
16351                         return;
16352                 }
16353         }
16354
16355         probe = first;
16356
16357         for (first = NULL; probe != NULL; probe = next) {
16358                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16359
16360                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16361
16362                 next = probe->dtpr_nextmod;
16363                 dtrace_hash_remove(dtrace_bymod, probe);
16364                 dtrace_hash_remove(dtrace_byfunc, probe);
16365                 dtrace_hash_remove(dtrace_byname, probe);
16366
16367                 if (first == NULL) {
16368                         first = probe;
16369                         probe->dtpr_nextmod = NULL;
16370                 } else {
16371                         probe->dtpr_nextmod = first;
16372                         first = probe;
16373                 }
16374         }
16375
16376         /*
16377          * We've removed all of the module's probes from the hash chains and
16378          * from the probe array.  Now issue a dtrace_sync() to be sure that
16379          * everyone has cleared out from any probe array processing.
16380          */
16381         dtrace_sync();
16382
16383         for (probe = first; probe != NULL; probe = first) {
16384                 first = probe->dtpr_nextmod;
16385                 prov = probe->dtpr_provider;
16386                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16387                     probe->dtpr_arg);
16388                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16389                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16390                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16391 #if defined(sun)
16392                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16393 #else
16394                 free_unr(dtrace_arena, probe->dtpr_id);
16395 #endif
16396                 kmem_free(probe, sizeof (dtrace_probe_t));
16397         }
16398
16399         mutex_exit(&dtrace_lock);
16400 #if defined(sun)
16401         mutex_exit(&mod_lock);
16402 #endif
16403         mutex_exit(&dtrace_provider_lock);
16404 }
16405
16406 #if !defined(sun)
16407 static void
16408 dtrace_kld_load(void *arg __unused, linker_file_t lf)
16409 {
16410
16411         dtrace_module_loaded(lf);
16412 }
16413
16414 static void
16415 dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16416 {
16417
16418         if (*error != 0)
16419                 /* We already have an error, so don't do anything. */
16420                 return;
16421         dtrace_module_unloaded(lf, error);
16422 }
16423 #endif
16424
16425 #if defined(sun)
16426 static void
16427 dtrace_suspend(void)
16428 {
16429         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16430 }
16431
16432 static void
16433 dtrace_resume(void)
16434 {
16435         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16436 }
16437 #endif
16438
16439 static int
16440 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16441 {
16442         ASSERT(MUTEX_HELD(&cpu_lock));
16443         mutex_enter(&dtrace_lock);
16444
16445         switch (what) {
16446         case CPU_CONFIG: {
16447                 dtrace_state_t *state;
16448                 dtrace_optval_t *opt, rs, c;
16449
16450                 /*
16451                  * For now, we only allocate a new buffer for anonymous state.
16452                  */
16453                 if ((state = dtrace_anon.dta_state) == NULL)
16454                         break;
16455
16456                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16457                         break;
16458
16459                 opt = state->dts_options;
16460                 c = opt[DTRACEOPT_CPU];
16461
16462                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16463                         break;
16464
16465                 /*
16466                  * Regardless of what the actual policy is, we're going to
16467                  * temporarily set our resize policy to be manual.  We're
16468                  * also going to temporarily set our CPU option to denote
16469                  * the newly configured CPU.
16470                  */
16471                 rs = opt[DTRACEOPT_BUFRESIZE];
16472                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16473                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16474
16475                 (void) dtrace_state_buffers(state);
16476
16477                 opt[DTRACEOPT_BUFRESIZE] = rs;
16478                 opt[DTRACEOPT_CPU] = c;
16479
16480                 break;
16481         }
16482
16483         case CPU_UNCONFIG:
16484                 /*
16485                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
16486                  * buffer will be freed when the consumer exits.)
16487                  */
16488                 break;
16489
16490         default:
16491                 break;
16492         }
16493
16494         mutex_exit(&dtrace_lock);
16495         return (0);
16496 }
16497
16498 #if defined(sun)
16499 static void
16500 dtrace_cpu_setup_initial(processorid_t cpu)
16501 {
16502         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16503 }
16504 #endif
16505
16506 static void
16507 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16508 {
16509         if (dtrace_toxranges >= dtrace_toxranges_max) {
16510                 int osize, nsize;
16511                 dtrace_toxrange_t *range;
16512
16513                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16514
16515                 if (osize == 0) {
16516                         ASSERT(dtrace_toxrange == NULL);
16517                         ASSERT(dtrace_toxranges_max == 0);
16518                         dtrace_toxranges_max = 1;
16519                 } else {
16520                         dtrace_toxranges_max <<= 1;
16521                 }
16522
16523                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16524                 range = kmem_zalloc(nsize, KM_SLEEP);
16525
16526                 if (dtrace_toxrange != NULL) {
16527                         ASSERT(osize != 0);
16528                         bcopy(dtrace_toxrange, range, osize);
16529                         kmem_free(dtrace_toxrange, osize);
16530                 }
16531
16532                 dtrace_toxrange = range;
16533         }
16534
16535         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16536         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16537
16538         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16539         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16540         dtrace_toxranges++;
16541 }
16542
/*
 * Barrier used on closef() for enablings that call getf().  This is a
 * no-op unless built for Solaris ("sun" defined).  Takes no arguments; the
 * definition now carries a real prototype instead of an old-style empty
 * parameter list.
 */
static void
dtrace_getf_barrier(void)
{
#if defined(sun)
	/*
	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
	 * that contain calls to getf(), this routine will be called on every
	 * closef() before either the underlying vnode is released or the
	 * file_t itself is freed.  By the time we are here, it is essential
	 * that the file_t can no longer be accessed from a call to getf()
	 * in probe context -- that assures that a dtrace_sync() can be used
	 * to clear out any enablings referring to the old structures.
	 */
	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
	    kcred->cr_zone->zone_dtrace_getf != 0)
		dtrace_sync();
#endif
}
16561
16562 /*
16563  * DTrace Driver Cookbook Functions
16564  */
16565 #if defined(sun)
16566 /*ARGSUSED*/
16567 static int
16568 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16569 {
16570         dtrace_provider_id_t id;
16571         dtrace_state_t *state = NULL;
16572         dtrace_enabling_t *enab;
16573
16574         mutex_enter(&cpu_lock);
16575         mutex_enter(&dtrace_provider_lock);
16576         mutex_enter(&dtrace_lock);
16577
16578         if (ddi_soft_state_init(&dtrace_softstate,
16579             sizeof (dtrace_state_t), 0) != 0) {
16580                 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16581                 mutex_exit(&cpu_lock);
16582                 mutex_exit(&dtrace_provider_lock);
16583                 mutex_exit(&dtrace_lock);
16584                 return (DDI_FAILURE);
16585         }
16586
16587         if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16588             DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16589             ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16590             DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16591                 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16592                 ddi_remove_minor_node(devi, NULL);
16593                 ddi_soft_state_fini(&dtrace_softstate);
16594                 mutex_exit(&cpu_lock);
16595                 mutex_exit(&dtrace_provider_lock);
16596                 mutex_exit(&dtrace_lock);
16597                 return (DDI_FAILURE);
16598         }
16599
16600         ddi_report_dev(devi);
16601         dtrace_devi = devi;
16602
16603         dtrace_modload = dtrace_module_loaded;
16604         dtrace_modunload = dtrace_module_unloaded;
16605         dtrace_cpu_init = dtrace_cpu_setup_initial;
16606         dtrace_helpers_cleanup = dtrace_helpers_destroy;
16607         dtrace_helpers_fork = dtrace_helpers_duplicate;
16608         dtrace_cpustart_init = dtrace_suspend;
16609         dtrace_cpustart_fini = dtrace_resume;
16610         dtrace_debugger_init = dtrace_suspend;
16611         dtrace_debugger_fini = dtrace_resume;
16612
16613         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16614
16615         ASSERT(MUTEX_HELD(&cpu_lock));
16616
16617         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16618             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16619         dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16620             UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16621             VM_SLEEP | VMC_IDENTIFIER);
16622         dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16623             1, INT_MAX, 0);
16624
16625         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16626             sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16627             NULL, NULL, NULL, NULL, NULL, 0);
16628
16629         ASSERT(MUTEX_HELD(&cpu_lock));
16630         dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16631             offsetof(dtrace_probe_t, dtpr_nextmod),
16632             offsetof(dtrace_probe_t, dtpr_prevmod));
16633
16634         dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16635             offsetof(dtrace_probe_t, dtpr_nextfunc),
16636             offsetof(dtrace_probe_t, dtpr_prevfunc));
16637
16638         dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16639             offsetof(dtrace_probe_t, dtpr_nextname),
16640             offsetof(dtrace_probe_t, dtpr_prevname));
16641
16642         if (dtrace_retain_max < 1) {
16643                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16644                     "setting to 1", dtrace_retain_max);
16645                 dtrace_retain_max = 1;
16646         }
16647
16648         /*
16649          * Now discover our toxic ranges.
16650          */
16651         dtrace_toxic_ranges(dtrace_toxrange_add);
16652
16653         /*
16654          * Before we register ourselves as a provider to our own framework,
16655          * we would like to assert that dtrace_provider is NULL -- but that's
16656          * not true if we were loaded as a dependency of a DTrace provider.
16657          * Once we've registered, we can assert that dtrace_provider is our
16658          * pseudo provider.
16659          */
16660         (void) dtrace_register("dtrace", &dtrace_provider_attr,
16661             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16662
16663         ASSERT(dtrace_provider != NULL);
16664         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16665
16666         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16667             dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16668         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16669             dtrace_provider, NULL, NULL, "END", 0, NULL);
16670         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16671             dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16672
16673         dtrace_anon_property();
16674         mutex_exit(&cpu_lock);
16675
16676         /*
16677          * If DTrace helper tracing is enabled, we need to allocate the
16678          * trace buffer and initialize the values.
16679          */
16680         if (dtrace_helptrace_enabled) {
16681                 ASSERT(dtrace_helptrace_buffer == NULL);
16682                 dtrace_helptrace_buffer =
16683                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16684                 dtrace_helptrace_next = 0;
16685         }
16686
16687         /*
16688          * If there are already providers, we must ask them to provide their
16689          * probes, and then match any anonymous enabling against them.  Note
16690          * that there should be no other retained enablings at this time:
16691          * the only retained enablings at this time should be the anonymous
16692          * enabling.
16693          */
16694         if (dtrace_anon.dta_enabling != NULL) {
16695                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16696
16697                 dtrace_enabling_provide(NULL);
16698                 state = dtrace_anon.dta_state;
16699
16700                 /*
16701                  * We couldn't hold cpu_lock across the above call to
16702                  * dtrace_enabling_provide(), but we must hold it to actually
16703                  * enable the probes.  We have to drop all of our locks, pick
16704                  * up cpu_lock, and regain our locks before matching the
16705                  * retained anonymous enabling.
16706                  */
16707                 mutex_exit(&dtrace_lock);
16708                 mutex_exit(&dtrace_provider_lock);
16709
16710                 mutex_enter(&cpu_lock);
16711                 mutex_enter(&dtrace_provider_lock);
16712                 mutex_enter(&dtrace_lock);
16713
16714                 if ((enab = dtrace_anon.dta_enabling) != NULL)
16715                         (void) dtrace_enabling_match(enab, NULL);
16716
16717                 mutex_exit(&cpu_lock);
16718         }
16719
16720         mutex_exit(&dtrace_lock);
16721         mutex_exit(&dtrace_provider_lock);
16722
16723         if (state != NULL) {
16724                 /*
16725                  * If we created any anonymous state, set it going now.
16726                  */
16727                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16728         }
16729
16730         return (DDI_SUCCESS);
16731 }
16732 #endif
16733
16734 #if !defined(sun)
16735 static void dtrace_dtr(void *);
16736 #endif
16737
16738 /*ARGSUSED*/
16739 static int
16740 #if defined(sun)
16741 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16742 #else
16743 dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16744 #endif
16745 {
16746         dtrace_state_t *state;
16747         uint32_t priv;
16748         uid_t uid;
16749         zoneid_t zoneid;
16750
16751 #if defined(sun)
16752         if (getminor(*devp) == DTRACEMNRN_HELPER)
16753                 return (0);
16754
16755         /*
16756          * If this wasn't an open with the "helper" minor, then it must be
16757          * the "dtrace" minor.
16758          */
16759         if (getminor(*devp) == DTRACEMNRN_DTRACE)
16760                 return (ENXIO);
16761 #else
16762         cred_t *cred_p = NULL;
16763         cred_p = dev->si_cred;
16764
16765         /*
16766          * If no DTRACE_PRIV_* bits are set in the credential, then the
16767          * caller lacks sufficient permission to do anything with DTrace.
16768          */
16769         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16770         if (priv == DTRACE_PRIV_NONE) {
16771 #endif
16772
16773                 return (EACCES);
16774         }
16775
16776         /*
16777          * Ask all providers to provide all their probes.
16778          */
16779         mutex_enter(&dtrace_provider_lock);
16780         dtrace_probe_provide(NULL, NULL);
16781         mutex_exit(&dtrace_provider_lock);
16782
16783         mutex_enter(&cpu_lock);
16784         mutex_enter(&dtrace_lock);
16785         dtrace_opens++;
16786         dtrace_membar_producer();
16787
16788 #if defined(sun)
16789         /*
16790          * If the kernel debugger is active (that is, if the kernel debugger
16791          * modified text in some way), we won't allow the open.
16792          */
16793         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16794                 dtrace_opens--;
16795                 mutex_exit(&cpu_lock);
16796                 mutex_exit(&dtrace_lock);
16797                 return (EBUSY);
16798         }
16799
16800         state = dtrace_state_create(devp, cred_p);
16801 #else
16802         state = dtrace_state_create(dev);
16803         devfs_set_cdevpriv(state, dtrace_dtr);
16804 #endif
16805
16806         mutex_exit(&cpu_lock);
16807
16808         if (state == NULL) {
16809 #if defined(sun)
16810                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16811                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16812 #else
16813                 --dtrace_opens;
16814 #endif
16815                 mutex_exit(&dtrace_lock);
16816                 return (EAGAIN);
16817         }
16818
16819         mutex_exit(&dtrace_lock);
16820
16821         return (0);
16822 }
16823
16824 /*ARGSUSED*/
16825 #if defined(sun)
16826 static int
16827 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16828 #else
16829 static void
16830 dtrace_dtr(void *data)
16831 #endif
16832 {
16833 #if defined(sun)
16834         minor_t minor = getminor(dev);
16835         dtrace_state_t *state;
16836
16837         if (minor == DTRACEMNRN_HELPER)
16838                 return (0);
16839
16840         state = ddi_get_soft_state(dtrace_softstate, minor);
16841 #else
16842         dtrace_state_t *state = data;
16843 #endif
16844
16845         mutex_enter(&cpu_lock);
16846         mutex_enter(&dtrace_lock);
16847
16848         if (state != NULL) {
16849                 if (state->dts_anon) {
16850                         /*
16851                          * There is anonymous state. Destroy that first.
16852                          */
16853                         ASSERT(dtrace_anon.dta_state == NULL);
16854                         dtrace_state_destroy(state->dts_anon);
16855                 }
16856
16857                 dtrace_state_destroy(state);
16858
16859 #if !defined(sun)
16860                 kmem_free(state, 0);
16861 #endif
16862         }
16863
16864         ASSERT(dtrace_opens > 0);
16865 #if defined(sun)
16866         /*
16867          * Only relinquish control of the kernel debugger interface when there
16868          * are no consumers and no anonymous enablings.
16869          */
16870         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16871                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16872 #else
16873         --dtrace_opens;
16874 #endif
16875
16876         mutex_exit(&dtrace_lock);
16877         mutex_exit(&cpu_lock);
16878
16879 #if defined(sun)
16880         return (0);
16881 #endif
16882 }
16883
16884 #if defined(sun)
16885 /*ARGSUSED*/
16886 static int
16887 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
16888 {
16889         int rval;
16890         dof_helper_t help, *dhp = NULL;
16891
16892         switch (cmd) {
16893         case DTRACEHIOC_ADDDOF:
16894                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
16895                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
16896                         return (EFAULT);
16897                 }
16898
16899                 dhp = &help;
16900                 arg = (intptr_t)help.dofhp_dof;
16901                 /*FALLTHROUGH*/
16902
16903         case DTRACEHIOC_ADD: {
16904                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
16905
16906                 if (dof == NULL)
16907                         return (rval);
16908
16909                 mutex_enter(&dtrace_lock);
16910
16911                 /*
16912                  * dtrace_helper_slurp() takes responsibility for the dof --
16913                  * it may free it now or it may save it and free it later.
16914                  */
16915                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
16916                         *rv = rval;
16917                         rval = 0;
16918                 } else {
16919                         rval = EINVAL;
16920                 }
16921
16922                 mutex_exit(&dtrace_lock);
16923                 return (rval);
16924         }
16925
16926         case DTRACEHIOC_REMOVE: {
16927                 mutex_enter(&dtrace_lock);
16928                 rval = dtrace_helper_destroygen(arg);
16929                 mutex_exit(&dtrace_lock);
16930
16931                 return (rval);
16932         }
16933
16934         default:
16935                 break;
16936         }
16937
16938         return (ENOTTY);
16939 }
16940
16941 /*ARGSUSED*/
16942 static int
16943 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16944 {
16945         minor_t minor = getminor(dev);
16946         dtrace_state_t *state;
16947         int rval;
16948
16949         if (minor == DTRACEMNRN_HELPER)
16950                 return (dtrace_ioctl_helper(cmd, arg, rv));
16951
16952         state = ddi_get_soft_state(dtrace_softstate, minor);
16953
16954         if (state->dts_anon) {
16955                 ASSERT(dtrace_anon.dta_state == NULL);
16956                 state = state->dts_anon;
16957         }
16958
16959         switch (cmd) {
16960         case DTRACEIOC_PROVIDER: {
16961                 dtrace_providerdesc_t pvd;
16962                 dtrace_provider_t *pvp;
16963
16964                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
16965                         return (EFAULT);
16966
16967                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
16968                 mutex_enter(&dtrace_provider_lock);
16969
16970                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
16971                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
16972                                 break;
16973                 }
16974
16975                 mutex_exit(&dtrace_provider_lock);
16976
16977                 if (pvp == NULL)
16978                         return (ESRCH);
16979
16980                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
16981                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
16982
16983                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
16984                         return (EFAULT);
16985
16986                 return (0);
16987         }
16988
16989         case DTRACEIOC_EPROBE: {
16990                 dtrace_eprobedesc_t epdesc;
16991                 dtrace_ecb_t *ecb;
16992                 dtrace_action_t *act;
16993                 void *buf;
16994                 size_t size;
16995                 uintptr_t dest;
16996                 int nrecs;
16997
16998                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
16999                         return (EFAULT);
17000
17001                 mutex_enter(&dtrace_lock);
17002
17003                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17004                         mutex_exit(&dtrace_lock);
17005                         return (EINVAL);
17006                 }
17007
17008                 if (ecb->dte_probe == NULL) {
17009                         mutex_exit(&dtrace_lock);
17010                         return (EINVAL);
17011                 }
17012
17013                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17014                 epdesc.dtepd_uarg = ecb->dte_uarg;
17015                 epdesc.dtepd_size = ecb->dte_size;
17016
17017                 nrecs = epdesc.dtepd_nrecs;
17018                 epdesc.dtepd_nrecs = 0;
17019                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17020                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17021                                 continue;
17022
17023                         epdesc.dtepd_nrecs++;
17024                 }
17025
17026                 /*
17027                  * Now that we have the size, we need to allocate a temporary
17028                  * buffer in which to store the complete description.  We need
17029                  * the temporary buffer to be able to drop dtrace_lock()
17030                  * across the copyout(), below.
17031                  */
17032                 size = sizeof (dtrace_eprobedesc_t) +
17033                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17034
17035                 buf = kmem_alloc(size, KM_SLEEP);
17036                 dest = (uintptr_t)buf;
17037
17038                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17039                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17040
17041                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17042                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17043                                 continue;
17044
17045                         if (nrecs-- == 0)
17046                                 break;
17047
17048                         bcopy(&act->dta_rec, (void *)dest,
17049                             sizeof (dtrace_recdesc_t));
17050                         dest += sizeof (dtrace_recdesc_t);
17051                 }
17052
17053                 mutex_exit(&dtrace_lock);
17054
17055                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17056                         kmem_free(buf, size);
17057                         return (EFAULT);
17058                 }
17059
17060                 kmem_free(buf, size);
17061                 return (0);
17062         }
17063
17064         case DTRACEIOC_AGGDESC: {
17065                 dtrace_aggdesc_t aggdesc;
17066                 dtrace_action_t *act;
17067                 dtrace_aggregation_t *agg;
17068                 int nrecs;
17069                 uint32_t offs;
17070                 dtrace_recdesc_t *lrec;
17071                 void *buf;
17072                 size_t size;
17073                 uintptr_t dest;
17074
17075                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17076                         return (EFAULT);
17077
17078                 mutex_enter(&dtrace_lock);
17079
17080                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17081                         mutex_exit(&dtrace_lock);
17082                         return (EINVAL);
17083                 }
17084
17085                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17086
17087                 nrecs = aggdesc.dtagd_nrecs;
17088                 aggdesc.dtagd_nrecs = 0;
17089
17090                 offs = agg->dtag_base;
17091                 lrec = &agg->dtag_action.dta_rec;
17092                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17093
17094                 for (act = agg->dtag_first; ; act = act->dta_next) {
17095                         ASSERT(act->dta_intuple ||
17096                             DTRACEACT_ISAGG(act->dta_kind));
17097
17098                         /*
17099                          * If this action has a record size of zero, it
17100                          * denotes an argument to the aggregating action.
17101                          * Because the presence of this record doesn't (or
17102                          * shouldn't) affect the way the data is interpreted,
17103                          * we don't copy it out to save user-level the
17104                          * confusion of dealing with a zero-length record.
17105                          */
17106                         if (act->dta_rec.dtrd_size == 0) {
17107                                 ASSERT(agg->dtag_hasarg);
17108                                 continue;
17109                         }
17110
17111                         aggdesc.dtagd_nrecs++;
17112
17113                         if (act == &agg->dtag_action)
17114                                 break;
17115                 }
17116
17117                 /*
17118                  * Now that we have the size, we need to allocate a temporary
17119                  * buffer in which to store the complete description.  We need
17120                  * the temporary buffer to be able to drop dtrace_lock()
17121                  * across the copyout(), below.
17122                  */
17123                 size = sizeof (dtrace_aggdesc_t) +
17124                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17125
17126                 buf = kmem_alloc(size, KM_SLEEP);
17127                 dest = (uintptr_t)buf;
17128
17129                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17130                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17131
17132                 for (act = agg->dtag_first; ; act = act->dta_next) {
17133                         dtrace_recdesc_t rec = act->dta_rec;
17134
17135                         /*
17136                          * See the comment in the above loop for why we pass
17137                          * over zero-length records.
17138                          */
17139                         if (rec.dtrd_size == 0) {
17140                                 ASSERT(agg->dtag_hasarg);
17141                                 continue;
17142                         }
17143
17144                         if (nrecs-- == 0)
17145                                 break;
17146
17147                         rec.dtrd_offset -= offs;
17148                         bcopy(&rec, (void *)dest, sizeof (rec));
17149                         dest += sizeof (dtrace_recdesc_t);
17150
17151                         if (act == &agg->dtag_action)
17152                                 break;
17153                 }
17154
17155                 mutex_exit(&dtrace_lock);
17156
17157                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17158                         kmem_free(buf, size);
17159                         return (EFAULT);
17160                 }
17161
17162                 kmem_free(buf, size);
17163                 return (0);
17164         }
17165
17166         case DTRACEIOC_ENABLE: {
17167                 dof_hdr_t *dof;
17168                 dtrace_enabling_t *enab = NULL;
17169                 dtrace_vstate_t *vstate;
17170                 int err = 0;
17171
17172                 *rv = 0;
17173
17174                 /*
17175                  * If a NULL argument has been passed, we take this as our
17176                  * cue to reevaluate our enablings.
17177                  */
17178                 if (arg == NULL) {
17179                         dtrace_enabling_matchall();
17180
17181                         return (0);
17182                 }
17183
17184                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17185                         return (rval);
17186
17187                 mutex_enter(&cpu_lock);
17188                 mutex_enter(&dtrace_lock);
17189                 vstate = &state->dts_vstate;
17190
17191                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17192                         mutex_exit(&dtrace_lock);
17193                         mutex_exit(&cpu_lock);
17194                         dtrace_dof_destroy(dof);
17195                         return (EBUSY);
17196                 }
17197
17198                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17199                         mutex_exit(&dtrace_lock);
17200                         mutex_exit(&cpu_lock);
17201                         dtrace_dof_destroy(dof);
17202                         return (EINVAL);
17203                 }
17204
17205                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17206                         dtrace_enabling_destroy(enab);
17207                         mutex_exit(&dtrace_lock);
17208                         mutex_exit(&cpu_lock);
17209                         dtrace_dof_destroy(dof);
17210                         return (rval);
17211                 }
17212
17213                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17214                         err = dtrace_enabling_retain(enab);
17215                 } else {
17216                         dtrace_enabling_destroy(enab);
17217                 }
17218
17219                 mutex_exit(&cpu_lock);
17220                 mutex_exit(&dtrace_lock);
17221                 dtrace_dof_destroy(dof);
17222
17223                 return (err);
17224         }
17225
17226         case DTRACEIOC_REPLICATE: {
17227                 dtrace_repldesc_t desc;
17228                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17229                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17230                 int err;
17231
17232                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17233                         return (EFAULT);
17234
17235                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17236                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17237                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17238                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17239
17240                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17241                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17242                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17243                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17244
17245                 mutex_enter(&dtrace_lock);
17246                 err = dtrace_enabling_replicate(state, match, create);
17247                 mutex_exit(&dtrace_lock);
17248
17249                 return (err);
17250         }
17251
17252         case DTRACEIOC_PROBEMATCH:
17253         case DTRACEIOC_PROBES: {
17254                 dtrace_probe_t *probe = NULL;
17255                 dtrace_probedesc_t desc;
17256                 dtrace_probekey_t pkey;
17257                 dtrace_id_t i;
17258                 int m = 0;
17259                 uint32_t priv;
17260                 uid_t uid;
17261                 zoneid_t zoneid;
17262
17263                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17264                         return (EFAULT);
17265
17266                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17267                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17268                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17269                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17270
17271                 /*
17272                  * Before we attempt to match this probe, we want to give
17273                  * all providers the opportunity to provide it.
17274                  */
17275                 if (desc.dtpd_id == DTRACE_IDNONE) {
17276                         mutex_enter(&dtrace_provider_lock);
17277                         dtrace_probe_provide(&desc, NULL);
17278                         mutex_exit(&dtrace_provider_lock);
17279                         desc.dtpd_id++;
17280                 }
17281
17282                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17283                         dtrace_probekey(&desc, &pkey);
17284                         pkey.dtpk_id = DTRACE_IDNONE;
17285                 }
17286
17287                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17288
17289                 mutex_enter(&dtrace_lock);
17290
17291                 if (cmd == DTRACEIOC_PROBEMATCH) {
17292                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17293                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17294                                     (m = dtrace_match_probe(probe, &pkey,
17295                                     priv, uid, zoneid)) != 0)
17296                                         break;
17297                         }
17298
17299                         if (m < 0) {
17300                                 mutex_exit(&dtrace_lock);
17301                                 return (EINVAL);
17302                         }
17303
17304                 } else {
17305                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17306                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17307                                     dtrace_match_priv(probe, priv, uid, zoneid))
17308                                         break;
17309                         }
17310                 }
17311
17312                 if (probe == NULL) {
17313                         mutex_exit(&dtrace_lock);
17314                         return (ESRCH);
17315                 }
17316
17317                 dtrace_probe_description(probe, &desc);
17318                 mutex_exit(&dtrace_lock);
17319
17320                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17321                         return (EFAULT);
17322
17323                 return (0);
17324         }
17325
17326         case DTRACEIOC_PROBEARG: {
17327                 dtrace_argdesc_t desc;
17328                 dtrace_probe_t *probe;
17329                 dtrace_provider_t *prov;
17330
17331                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17332                         return (EFAULT);
17333
17334                 if (desc.dtargd_id == DTRACE_IDNONE)
17335                         return (EINVAL);
17336
17337                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17338                         return (EINVAL);
17339
17340                 mutex_enter(&dtrace_provider_lock);
17341                 mutex_enter(&mod_lock);
17342                 mutex_enter(&dtrace_lock);
17343
17344                 if (desc.dtargd_id > dtrace_nprobes) {
17345                         mutex_exit(&dtrace_lock);
17346                         mutex_exit(&mod_lock);
17347                         mutex_exit(&dtrace_provider_lock);
17348                         return (EINVAL);
17349                 }
17350
17351                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17352                         mutex_exit(&dtrace_lock);
17353                         mutex_exit(&mod_lock);
17354                         mutex_exit(&dtrace_provider_lock);
17355                         return (EINVAL);
17356                 }
17357
17358                 mutex_exit(&dtrace_lock);
17359
17360                 prov = probe->dtpr_provider;
17361
17362                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17363                         /*
17364                          * There isn't any typed information for this probe.
17365                          * Set the argument number to DTRACE_ARGNONE.
17366                          */
17367                         desc.dtargd_ndx = DTRACE_ARGNONE;
17368                 } else {
17369                         desc.dtargd_native[0] = '\0';
17370                         desc.dtargd_xlate[0] = '\0';
17371                         desc.dtargd_mapping = desc.dtargd_ndx;
17372
17373                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17374                             probe->dtpr_id, probe->dtpr_arg, &desc);
17375                 }
17376
17377                 mutex_exit(&mod_lock);
17378                 mutex_exit(&dtrace_provider_lock);
17379
17380                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17381                         return (EFAULT);
17382
17383                 return (0);
17384         }
17385
17386         case DTRACEIOC_GO: {
17387                 processorid_t cpuid;
17388                 rval = dtrace_state_go(state, &cpuid);
17389
17390                 if (rval != 0)
17391                         return (rval);
17392
17393                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17394                         return (EFAULT);
17395
17396                 return (0);
17397         }
17398
17399         case DTRACEIOC_STOP: {
17400                 processorid_t cpuid;
17401
17402                 mutex_enter(&dtrace_lock);
17403                 rval = dtrace_state_stop(state, &cpuid);
17404                 mutex_exit(&dtrace_lock);
17405
17406                 if (rval != 0)
17407                         return (rval);
17408
17409                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17410                         return (EFAULT);
17411
17412                 return (0);
17413         }
17414
17415         case DTRACEIOC_DOFGET: {
17416                 dof_hdr_t hdr, *dof;
17417                 uint64_t len;
17418
17419                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17420                         return (EFAULT);
17421
17422                 mutex_enter(&dtrace_lock);
17423                 dof = dtrace_dof_create(state);
17424                 mutex_exit(&dtrace_lock);
17425
17426                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17427                 rval = copyout(dof, (void *)arg, len);
17428                 dtrace_dof_destroy(dof);
17429
17430                 return (rval == 0 ? 0 : EFAULT);
17431         }
17432
17433         case DTRACEIOC_AGGSNAP:
17434         case DTRACEIOC_BUFSNAP: {
17435                 dtrace_bufdesc_t desc;
17436                 caddr_t cached;
17437                 dtrace_buffer_t *buf;
17438
17439                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17440                         return (EFAULT);
17441
17442                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17443                         return (EINVAL);
17444
17445                 mutex_enter(&dtrace_lock);
17446
17447                 if (cmd == DTRACEIOC_BUFSNAP) {
17448                         buf = &state->dts_buffer[desc.dtbd_cpu];
17449                 } else {
17450                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17451                 }
17452
17453                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17454                         size_t sz = buf->dtb_offset;
17455
17456                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17457                                 mutex_exit(&dtrace_lock);
17458                                 return (EBUSY);
17459                         }
17460
17461                         /*
17462                          * If this buffer has already been consumed, we're
17463                          * going to indicate that there's nothing left here
17464                          * to consume.
17465                          */
17466                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17467                                 mutex_exit(&dtrace_lock);
17468
17469                                 desc.dtbd_size = 0;
17470                                 desc.dtbd_drops = 0;
17471                                 desc.dtbd_errors = 0;
17472                                 desc.dtbd_oldest = 0;
17473                                 sz = sizeof (desc);
17474
17475                                 if (copyout(&desc, (void *)arg, sz) != 0)
17476                                         return (EFAULT);
17477
17478                                 return (0);
17479                         }
17480
17481                         /*
17482                          * If this is a ring buffer that has wrapped, we want
17483                          * to copy the whole thing out.
17484                          */
17485                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17486                                 dtrace_buffer_polish(buf);
17487                                 sz = buf->dtb_size;
17488                         }
17489
17490                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17491                                 mutex_exit(&dtrace_lock);
17492                                 return (EFAULT);
17493                         }
17494
17495                         desc.dtbd_size = sz;
17496                         desc.dtbd_drops = buf->dtb_drops;
17497                         desc.dtbd_errors = buf->dtb_errors;
17498                         desc.dtbd_oldest = buf->dtb_xamot_offset;
17499                         desc.dtbd_timestamp = dtrace_gethrtime();
17500
17501                         mutex_exit(&dtrace_lock);
17502
17503                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17504                                 return (EFAULT);
17505
17506                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
17507
17508                         return (0);
17509                 }
17510
17511                 if (buf->dtb_tomax == NULL) {
17512                         ASSERT(buf->dtb_xamot == NULL);
17513                         mutex_exit(&dtrace_lock);
17514                         return (ENOENT);
17515                 }
17516
17517                 cached = buf->dtb_tomax;
17518                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17519
17520                 dtrace_xcall(desc.dtbd_cpu,
17521                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
17522
17523                 state->dts_errors += buf->dtb_xamot_errors;
17524
17525                 /*
17526                  * If the buffers did not actually switch, then the cross call
17527                  * did not take place -- presumably because the given CPU is
17528                  * not in the ready set.  If this is the case, we'll return
17529                  * ENOENT.
17530                  */
17531                 if (buf->dtb_tomax == cached) {
17532                         ASSERT(buf->dtb_xamot != cached);
17533                         mutex_exit(&dtrace_lock);
17534                         return (ENOENT);
17535                 }
17536
17537                 ASSERT(cached == buf->dtb_xamot);
17538
17539                 /*
17540                  * We have our snapshot; now copy it out.
17541                  */
17542                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
17543                     buf->dtb_xamot_offset) != 0) {
17544                         mutex_exit(&dtrace_lock);
17545                         return (EFAULT);
17546                 }
17547
17548                 desc.dtbd_size = buf->dtb_xamot_offset;
17549                 desc.dtbd_drops = buf->dtb_xamot_drops;
17550                 desc.dtbd_errors = buf->dtb_xamot_errors;
17551                 desc.dtbd_oldest = 0;
17552                 desc.dtbd_timestamp = buf->dtb_switched;
17553
17554                 mutex_exit(&dtrace_lock);
17555
17556                 /*
17557                  * Finally, copy out the buffer description.
17558                  */
17559                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17560                         return (EFAULT);
17561
17562                 return (0);
17563         }
17564
17565         case DTRACEIOC_CONF: {
17566                 dtrace_conf_t conf;
17567
17568                 bzero(&conf, sizeof (conf));
17569                 conf.dtc_difversion = DIF_VERSION;
17570                 conf.dtc_difintregs = DIF_DIR_NREGS;
17571                 conf.dtc_diftupregs = DIF_DTR_NREGS;
17572                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17573
17574                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17575                         return (EFAULT);
17576
17577                 return (0);
17578         }
17579
17580         case DTRACEIOC_STATUS: {
17581                 dtrace_status_t stat;
17582                 dtrace_dstate_t *dstate;
17583                 int i, j;
17584                 uint64_t nerrs;
17585
17586                 /*
17587                  * See the comment in dtrace_state_deadman() for the reason
17588                  * for setting dts_laststatus to INT64_MAX before setting
17589                  * it to the correct value.
17590                  */
17591                 state->dts_laststatus = INT64_MAX;
17592                 dtrace_membar_producer();
17593                 state->dts_laststatus = dtrace_gethrtime();
17594
17595                 bzero(&stat, sizeof (stat));
17596
17597                 mutex_enter(&dtrace_lock);
17598
17599                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17600                         mutex_exit(&dtrace_lock);
17601                         return (ENOENT);
17602                 }
17603
17604                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17605                         stat.dtst_exiting = 1;
17606
17607                 nerrs = state->dts_errors;
17608                 dstate = &state->dts_vstate.dtvs_dynvars;
17609
17610                 for (i = 0; i < NCPU; i++) {
17611                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17612
17613                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
17614                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17615                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17616
17617                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17618                                 stat.dtst_filled++;
17619
17620                         nerrs += state->dts_buffer[i].dtb_errors;
17621
17622                         for (j = 0; j < state->dts_nspeculations; j++) {
17623                                 dtrace_speculation_t *spec;
17624                                 dtrace_buffer_t *buf;
17625
17626                                 spec = &state->dts_speculations[j];
17627                                 buf = &spec->dtsp_buffer[i];
17628                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
17629                         }
17630                 }
17631
17632                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
17633                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17634                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17635                 stat.dtst_dblerrors = state->dts_dblerrors;
17636                 stat.dtst_killed =
17637                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17638                 stat.dtst_errors = nerrs;
17639
17640                 mutex_exit(&dtrace_lock);
17641
17642                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
17643                         return (EFAULT);
17644
17645                 return (0);
17646         }
17647
17648         case DTRACEIOC_FORMAT: {
17649                 dtrace_fmtdesc_t fmt;
17650                 char *str;
17651                 int len;
17652
17653                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
17654                         return (EFAULT);
17655
17656                 mutex_enter(&dtrace_lock);
17657
17658                 if (fmt.dtfd_format == 0 ||
17659                     fmt.dtfd_format > state->dts_nformats) {
17660                         mutex_exit(&dtrace_lock);
17661                         return (EINVAL);
17662                 }
17663
17664                 /*
17665                  * Format strings are allocated contiguously and they are
17666                  * never freed; if a format index is less than the number
17667                  * of formats, we can assert that the format map is non-NULL
17668                  * and that the format for the specified index is non-NULL.
17669                  */
17670                 ASSERT(state->dts_formats != NULL);
17671                 str = state->dts_formats[fmt.dtfd_format - 1];
17672                 ASSERT(str != NULL);
17673
17674                 len = strlen(str) + 1;
17675
17676                 if (len > fmt.dtfd_length) {
17677                         fmt.dtfd_length = len;
17678
17679                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
17680                                 mutex_exit(&dtrace_lock);
17681                                 return (EINVAL);
17682                         }
17683                 } else {
17684                         if (copyout(str, fmt.dtfd_string, len) != 0) {
17685                                 mutex_exit(&dtrace_lock);
17686                                 return (EINVAL);
17687                         }
17688                 }
17689
17690                 mutex_exit(&dtrace_lock);
17691                 return (0);
17692         }
17693
17694         default:
17695                 break;
17696         }
17697
17698         return (ENOTTY);
17699 }
17700
17701 /*ARGSUSED*/
17702 static int
17703 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17704 {
17705         dtrace_state_t *state;
17706
17707         switch (cmd) {
17708         case DDI_DETACH:
17709                 break;
17710
17711         case DDI_SUSPEND:
17712                 return (DDI_SUCCESS);
17713
17714         default:
17715                 return (DDI_FAILURE);
17716         }
17717
17718         mutex_enter(&cpu_lock);
17719         mutex_enter(&dtrace_provider_lock);
17720         mutex_enter(&dtrace_lock);
17721
17722         ASSERT(dtrace_opens == 0);
17723
17724         if (dtrace_helpers > 0) {
17725                 mutex_exit(&dtrace_provider_lock);
17726                 mutex_exit(&dtrace_lock);
17727                 mutex_exit(&cpu_lock);
17728                 return (DDI_FAILURE);
17729         }
17730
17731         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17732                 mutex_exit(&dtrace_provider_lock);
17733                 mutex_exit(&dtrace_lock);
17734                 mutex_exit(&cpu_lock);
17735                 return (DDI_FAILURE);
17736         }
17737
17738         dtrace_provider = NULL;
17739
17740         if ((state = dtrace_anon_grab()) != NULL) {
17741                 /*
17742                  * If there were ECBs on this state, the provider should
17743                  * have not been allowed to detach; assert that there is
17744                  * none.
17745                  */
17746                 ASSERT(state->dts_necbs == 0);
17747                 dtrace_state_destroy(state);
17748
17749                 /*
17750                  * If we're being detached with anonymous state, we need to
17751                  * indicate to the kernel debugger that DTrace is now inactive.
17752                  */
17753                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17754         }
17755
17756         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17757         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17758         dtrace_cpu_init = NULL;
17759         dtrace_helpers_cleanup = NULL;
17760         dtrace_helpers_fork = NULL;
17761         dtrace_cpustart_init = NULL;
17762         dtrace_cpustart_fini = NULL;
17763         dtrace_debugger_init = NULL;
17764         dtrace_debugger_fini = NULL;
17765         dtrace_modload = NULL;
17766         dtrace_modunload = NULL;
17767
17768         ASSERT(dtrace_getf == 0);
17769         ASSERT(dtrace_closef == NULL);
17770
17771         mutex_exit(&cpu_lock);
17772
17773         if (dtrace_helptrace_enabled) {
17774                 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
17775                 dtrace_helptrace_buffer = NULL;
17776         }
17777
17778         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17779         dtrace_probes = NULL;
17780         dtrace_nprobes = 0;
17781
17782         dtrace_hash_destroy(dtrace_bymod);
17783         dtrace_hash_destroy(dtrace_byfunc);
17784         dtrace_hash_destroy(dtrace_byname);
17785         dtrace_bymod = NULL;
17786         dtrace_byfunc = NULL;
17787         dtrace_byname = NULL;
17788
17789         kmem_cache_destroy(dtrace_state_cache);
17790         vmem_destroy(dtrace_minor);
17791         vmem_destroy(dtrace_arena);
17792
17793         if (dtrace_toxrange != NULL) {
17794                 kmem_free(dtrace_toxrange,
17795                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17796                 dtrace_toxrange = NULL;
17797                 dtrace_toxranges = 0;
17798                 dtrace_toxranges_max = 0;
17799         }
17800
17801         ddi_remove_minor_node(dtrace_devi, NULL);
17802         dtrace_devi = NULL;
17803
17804         ddi_soft_state_fini(&dtrace_softstate);
17805
17806         ASSERT(dtrace_vtime_references == 0);
17807         ASSERT(dtrace_opens == 0);
17808         ASSERT(dtrace_retained == NULL);
17809
17810         mutex_exit(&dtrace_lock);
17811         mutex_exit(&dtrace_provider_lock);
17812
17813         /*
17814          * We don't destroy the task queue until after we have dropped our
17815          * locks (taskq_destroy() may block on running tasks).  To prevent
17816          * attempting to do work after we have effectively detached but before
17817          * the task queue has been destroyed, all tasks dispatched via the
17818          * task queue must check that DTrace is still attached before
17819          * performing any operation.
17820          */
17821         taskq_destroy(dtrace_taskq);
17822         dtrace_taskq = NULL;
17823
17824         return (DDI_SUCCESS);
17825 }
17826 #endif
17827
17828 #if defined(sun)
17829 /*ARGSUSED*/
17830 static int
17831 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
17832 {
17833         int error;
17834
17835         switch (infocmd) {
17836         case DDI_INFO_DEVT2DEVINFO:
17837                 *result = (void *)dtrace_devi;
17838                 error = DDI_SUCCESS;
17839                 break;
17840         case DDI_INFO_DEVT2INSTANCE:
17841                 *result = (void *)0;
17842                 error = DDI_SUCCESS;
17843                 break;
17844         default:
17845                 error = DDI_FAILURE;
17846         }
17847         return (error);
17848 }
17849 #endif
17850
17851 #if defined(sun)
/*
 * Character/block entry point table for the DTrace pseudo device.  Only the
 * open, close and ioctl slots are backed by DTrace routines (dtrace_open,
 * dtrace_close, dtrace_ioctl); every other slot is filled with one of the
 * generic nulldev, nodev, nochpoll or ddi_prop_op routines.  The
 * initializers are positional, so their order must match struct cb_ops.
 */
static struct cb_ops dtrace_cb_ops = {
        dtrace_open,            /* open */
        dtrace_close,           /* close */
        nulldev,                /* strategy */
        nulldev,                /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        nodev,                  /* write */
        dtrace_ioctl,           /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        ddi_prop_op,            /* cb_prop_op */
        0,                      /* streamtab */
        D_NEW | D_MP            /* driver compatibility flags */
};
17869
/*
 * Device operations table for the DTrace pseudo driver.  It wires up
 * dtrace_info, dtrace_attach and dtrace_detach and points at dtrace_cb_ops
 * for the driver entry points; the remaining slots hold nulldev, nodev or
 * NULL.  The initializers are positional, so their order must match
 * struct dev_ops.
 */
static struct dev_ops dtrace_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        dtrace_info,            /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        dtrace_attach,          /* attach */
        dtrace_detach,          /* detach */
        nodev,                  /* reset */
        &dtrace_cb_ops,         /* driver operations */
        NULL,                   /* bus operations */
        nodev                   /* dev power */
};
17883
/*
 * Driver linkage record: identifies this module as a driver
 * (mod_driverops), names it "Dynamic Tracing" and ties it to dtrace_ops.
 */
static struct modldrv modldrv = {
        &mod_driverops,         /* module type (this is a pseudo driver) */
        "Dynamic Tracing",      /* name of module */
        &dtrace_ops,            /* driver ops */
};
17889
/*
 * Module linkage handed to mod_install(), mod_info() and mod_remove() by
 * _init(), _info() and _fini().  It carries the revision MODREV_1 and a
 * NULL-terminated list whose only entry is modldrv.
 */
static struct modlinkage modlinkage = {
        MODREV_1,
        (void *)&modldrv,
        NULL
};
17895
17896 int
17897 _init(void)
17898 {
17899         return (mod_install(&modlinkage));
17900 }
17901
17902 int
17903 _info(struct modinfo *modinfop)
17904 {
17905         return (mod_info(&modlinkage, modinfop));
17906 }
17907
17908 int
17909 _fini(void)
17910 {
17911         return (mod_remove(&modlinkage));
17912 }
17913 #else
17914
/*
 * Forward declarations for the non-Solaris build.  The two ioctl handlers
 * are installed in dtrace_cdevsw and helper_cdevsw, and dtrace_load() and
 * dtrace_unload() are registered through SYSINIT/SYSUNINIT further down.
 * NOTE(review): the definitions are presumably supplied by the .c files
 * #included later in this branch -- confirm.
 */
static d_ioctl_t        dtrace_ioctl;
static d_ioctl_t        dtrace_ioctl_helper;
static void             dtrace_load(void *);
static int              dtrace_unload(void);
/*
 * Device handles, presumably for the "dtrace" and "helper" device nodes;
 * their creation is not visible here -- verify against the load path.
 */
static struct cdev      *dtrace_dev;
static struct cdev      *helper_dev;

/* Externally visible init/uninit pair for invop handling, defined elsewhere. */
void dtrace_invop_init(void);
void dtrace_invop_uninit(void);
17924
17925 static struct cdevsw dtrace_cdevsw = {
17926         .d_version      = D_VERSION,
17927         .d_ioctl        = dtrace_ioctl,
17928         .d_open         = dtrace_open,
17929         .d_name         = "dtrace",
17930 };
17931
17932 static struct cdevsw helper_cdevsw = {
17933         .d_version      = D_VERSION,
17934         .d_ioctl        = dtrace_ioctl_helper,
17935         .d_name         = "helper",
17936 };
17937
17938 #include <dtrace_anon.c>
17939 #include <dtrace_ioctl.c>
17940 #include <dtrace_load.c>
17941 #include <dtrace_modevent.c>
17942 #include <dtrace_sysctl.c>
17943 #include <dtrace_unload.c>
17944 #include <dtrace_vtime.c>
17945 #include <dtrace_hacks.c>
17946 #include <dtrace_isa.c>
17947
/*
 * Startup/shutdown hooks.  dtrace_load() and dtrace_unload() are registered
 * for the SI_SUB_DTRACE stage and dtrace_anon_init() for the
 * SI_SUB_DTRACE_ANON stage, each with SI_ORDER_FIRST and a NULL argument.
 */
SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);

/*
 * Module declaration: "dtrace" uses dtrace_modevent as its event handler,
 * advertises version 1 and depends on the "opensolaris" module.
 * NOTE(review): the three 1s passed to MODULE_DEPEND are presumably the
 * minimum, preferred and maximum acceptable versions -- confirm against
 * MODULE_DEPEND(9).
 */
DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
17955 #endif