]> CyberLeo.Net >> Repos - FreeBSD/releng/10.3.git/blob - sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
- Copy stable/10@296371 to releng/10.3 in preparation for 10.3-RC1
[FreeBSD/releng/10.3.git] / sys / cddl / contrib / opensolaris / uts / common / dtrace / dtrace.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * $FreeBSD$
22  */
23
24 /*
25  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
26  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
27  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
28  */
29
30 /*
31  * DTrace - Dynamic Tracing for Solaris
32  *
33  * This is the implementation of the Solaris Dynamic Tracing framework
34  * (DTrace).  The user-visible interface to DTrace is described at length in
35  * the "Solaris Dynamic Tracing Guide".  The interfaces between the libdtrace
36  * library, the in-kernel DTrace framework, and the DTrace providers are
37  * described in the block comments in the <sys/dtrace.h> header file.  The
38  * internal architecture of DTrace is described in the block comments in the
39  * <sys/dtrace_impl.h> header file.  The comments contained within the DTrace
40  * implementation very much assume mastery of all of these sources; if one has
41  * an unanswered question about the implementation, one should consult them
42  * first.
43  *
44  * The functions here are ordered roughly as follows:
45  *
46  *   - Probe context functions
47  *   - Probe hashing functions
48  *   - Non-probe context utility functions
49  *   - Matching functions
50  *   - Provider-to-Framework API functions
51  *   - Probe management functions
52  *   - DIF object functions
53  *   - Format functions
54  *   - Predicate functions
55  *   - ECB functions
56  *   - Buffer functions
57  *   - Enabling functions
58  *   - DOF functions
59  *   - Anonymous enabling functions
60  *   - Consumer state functions
61  *   - Helper functions
62  *   - Hook functions
63  *   - Driver cookbook functions
64  *
65  * Each group of functions begins with a block comment labelled the "DTrace
66  * [Group] Functions", allowing one to find each block by searching forward
67  * on capital-f functions.
68  */
69 #include <sys/errno.h>
70 #if !defined(sun)
71 #include <sys/time.h>
72 #endif
73 #include <sys/stat.h>
74 #include <sys/modctl.h>
75 #include <sys/conf.h>
76 #include <sys/systm.h>
77 #if defined(sun)
78 #include <sys/ddi.h>
79 #include <sys/sunddi.h>
80 #endif
81 #include <sys/cpuvar.h>
82 #include <sys/kmem.h>
83 #if defined(sun)
84 #include <sys/strsubr.h>
85 #endif
86 #include <sys/sysmacros.h>
87 #include <sys/dtrace_impl.h>
88 #include <sys/atomic.h>
89 #include <sys/cmn_err.h>
90 #if defined(sun)
91 #include <sys/mutex_impl.h>
92 #include <sys/rwlock_impl.h>
93 #endif
94 #include <sys/ctf_api.h>
95 #if defined(sun)
96 #include <sys/panic.h>
97 #include <sys/priv_impl.h>
98 #endif
99 #include <sys/policy.h>
100 #if defined(sun)
101 #include <sys/cred_impl.h>
102 #include <sys/procfs_isa.h>
103 #endif
104 #include <sys/taskq.h>
105 #if defined(sun)
106 #include <sys/mkdev.h>
107 #include <sys/kdi.h>
108 #endif
109 #include <sys/zone.h>
110 #include <sys/socket.h>
111 #include <netinet/in.h>
112 #include "strtolctype.h"
113
114 /* FreeBSD includes: */
115 #if !defined(sun)
116 #include <sys/callout.h>
117 #include <sys/ctype.h>
118 #include <sys/eventhandler.h>
119 #include <sys/limits.h>
120 #include <sys/kdb.h>
121 #include <sys/kernel.h>
122 #include <sys/malloc.h>
123 #include <sys/sysctl.h>
124 #include <sys/lock.h>
125 #include <sys/mutex.h>
126 #include <sys/rwlock.h>
127 #include <sys/sx.h>
128 #include <sys/dtrace_bsd.h>
129 #include <netinet/in.h>
130 #include "dtrace_cddl.h"
131 #include "dtrace_debug.c"
132 #endif
133
134 /*
135  * DTrace Tunable Variables
136  *
137  * The following variables may be tuned by adding a line to /etc/system that
138  * includes both the name of the DTrace module ("dtrace") and the name of the
139  * variable.  For example:
140  *
141  *   set dtrace:dtrace_destructive_disallow = 1
142  *
143  * In general, the only variables that one should be tuning this way are those
144  * that affect system-wide DTrace behavior, and for which the default behavior
145  * is undesirable.  Most of these variables are tunable on a per-consumer
146  * basis using DTrace options, and need not be tuned on a system-wide basis.
147  * When tuning these variables, avoid pathological values; while some attempt
148  * is made to verify the integrity of these variables, they are not considered
149  * part of the supported interface to DTrace, and they are therefore not
150  * checked comprehensively.  Further, these variables should not be tuned
151  * dynamically via "mdb -kw" or other means; they should only be tuned via
152  * /etc/system.
153  */
154 int             dtrace_destructive_disallow = 0;
155 dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
156 size_t          dtrace_difo_maxsize = (256 * 1024);
157 dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
158 size_t          dtrace_global_maxsize = (16 * 1024);
159 size_t          dtrace_actions_max = (16 * 1024);
160 size_t          dtrace_retain_max = 1024;
161 dtrace_optval_t dtrace_helper_actions_max = 128;
162 dtrace_optval_t dtrace_helper_providers_max = 32;
163 dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024);
164 size_t          dtrace_strsize_default = 256;
165 dtrace_optval_t dtrace_cleanrate_default = 9900990;             /* 101 hz */
166 dtrace_optval_t dtrace_cleanrate_min = 200000;                  /* 5000 hz */
167 dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;  /* 1/minute */
168 dtrace_optval_t dtrace_aggrate_default = NANOSEC;               /* 1 hz */
169 dtrace_optval_t dtrace_statusrate_default = NANOSEC;            /* 1 hz */
170 dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;  /* 6/minute */
171 dtrace_optval_t dtrace_switchrate_default = NANOSEC;            /* 1 hz */
172 dtrace_optval_t dtrace_nspec_default = 1;
173 dtrace_optval_t dtrace_specsize_default = 32 * 1024;
174 dtrace_optval_t dtrace_stackframes_default = 20;
175 dtrace_optval_t dtrace_ustackframes_default = 20;
176 dtrace_optval_t dtrace_jstackframes_default = 50;
177 dtrace_optval_t dtrace_jstackstrsize_default = 512;
178 int             dtrace_msgdsize_max = 128;
179 hrtime_t        dtrace_chill_max = MSEC2NSEC(500);              /* 500 ms */
180 hrtime_t        dtrace_chill_interval = NANOSEC;                /* 1000 ms */
181 int             dtrace_devdepth_max = 32;
182 int             dtrace_err_verbose;
183 hrtime_t        dtrace_deadman_interval = NANOSEC;
184 hrtime_t        dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
185 hrtime_t        dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
186 hrtime_t        dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
187 #if !defined(sun)
188 int             dtrace_memstr_max = 4096;
189 #endif
190
191 /*
192  * DTrace External Variables
193  *
194  * As dtrace(7D) is a kernel module, any DTrace variables are obviously
195  * available to DTrace consumers via the backtick (`) syntax.  One of these,
196  * dtrace_zero, is made deliberately so:  it is provided as a source of
197  * well-known, zero-filled memory.  While this variable is not documented,
198  * it is used by some translators as an implementation detail.
199  */
200 const char      dtrace_zero[256] = { 0 };       /* zero-filled memory */
201
202 /*
203  * DTrace Internal Variables
204  */
205 #if defined(sun)
206 static dev_info_t       *dtrace_devi;           /* device info */
207 #endif
208 #if defined(sun)
209 static vmem_t           *dtrace_arena;          /* probe ID arena */
210 static vmem_t           *dtrace_minor;          /* minor number arena */
211 #else
212 static taskq_t          *dtrace_taskq;          /* task queue */
213 static struct unrhdr    *dtrace_arena;          /* Probe ID number.     */
214 #endif
215 static dtrace_probe_t   **dtrace_probes;        /* array of all probes */
216 static int              dtrace_nprobes;         /* number of probes */
217 static dtrace_provider_t *dtrace_provider;      /* provider list */
218 static dtrace_meta_t    *dtrace_meta_pid;       /* user-land meta provider */
219 static int              dtrace_opens;           /* number of opens */
220 static int              dtrace_helpers;         /* number of helpers */
221 static int              dtrace_getf;            /* number of unpriv getf()s */
222 #if defined(sun)
223 static void             *dtrace_softstate;      /* softstate pointer */
224 #endif
225 static dtrace_hash_t    *dtrace_bymod;          /* probes hashed by module */
226 static dtrace_hash_t    *dtrace_byfunc;         /* probes hashed by function */
227 static dtrace_hash_t    *dtrace_byname;         /* probes hashed by name */
228 static dtrace_toxrange_t *dtrace_toxrange;      /* toxic range array */
229 static int              dtrace_toxranges;       /* number of toxic ranges */
230 static int              dtrace_toxranges_max;   /* size of toxic range array */
231 static dtrace_anon_t    dtrace_anon;            /* anonymous enabling */
232 static kmem_cache_t     *dtrace_state_cache;    /* cache for dynamic state */
233 static uint64_t         dtrace_vtime_references; /* number of vtimestamp refs */
234 static kthread_t        *dtrace_panicked;       /* panicking thread */
235 static dtrace_ecb_t     *dtrace_ecb_create_cache; /* cached created ECB */
236 static dtrace_genid_t   dtrace_probegen;        /* current probe generation */
237 static dtrace_helpers_t *dtrace_deferred_pid;   /* deferred helper list */
238 static dtrace_enabling_t *dtrace_retained;      /* list of retained enablings */
239 static dtrace_genid_t   dtrace_retained_gen;    /* current retained enab gen */
240 static dtrace_dynvar_t  dtrace_dynhash_sink;    /* end of dynamic hash chains */
241 static int              dtrace_dynvar_failclean; /* dynvars failed to clean */
242 #if !defined(sun)
243 static struct mtx       dtrace_unr_mtx;
244 MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
245 int             dtrace_in_probe;        /* non-zero if executing a probe */
246 #if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
247 uintptr_t       dtrace_in_probe_addr;   /* Address of invop when already in probe */
248 #endif
249 static eventhandler_tag dtrace_kld_load_tag;
250 static eventhandler_tag dtrace_kld_unload_try_tag;
251 #endif
252
253 /*
254  * DTrace Locking
255  * DTrace is protected by three (relatively coarse-grained) locks:
256  *
257  * (1) dtrace_lock is required to manipulate essentially any DTrace state,
258  *     including enabling state, probes, ECBs, consumer state, helper state,
259  *     etc.  Importantly, dtrace_lock is _not_ required when in probe context;
260  *     probe context is lock-free -- synchronization is handled via the
261  *     dtrace_sync() cross call mechanism.
262  *
263  * (2) dtrace_provider_lock is required when manipulating provider state, or
264  *     when provider state must be held constant.
265  *
266  * (3) dtrace_meta_lock is required when manipulating meta provider state, or
267  *     when meta provider state must be held constant.
268  *
269  * The lock ordering between these three locks is dtrace_meta_lock before
270  * dtrace_provider_lock before dtrace_lock.  (In particular, there are
271  * several places where dtrace_provider_lock is held by the framework as it
272  * calls into the providers -- which then call back into the framework,
273  * grabbing dtrace_lock.)
274  *
275  * There are two other locks in the mix:  mod_lock and cpu_lock.  With respect
276  * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
277  * role as a coarse-grained lock; it is acquired before both of these locks.
278  * With respect to dtrace_meta_lock, its behavior is stranger:  cpu_lock must
279  * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
280  * mod_lock is similar with respect to dtrace_provider_lock in that it must be
281  * acquired _between_ dtrace_provider_lock and dtrace_lock.
282  */
283 static kmutex_t         dtrace_lock;            /* probe state lock */
284 static kmutex_t         dtrace_provider_lock;   /* provider state lock */
285 static kmutex_t         dtrace_meta_lock;       /* meta-provider state lock */
286
287 #if !defined(sun)
288 /* XXX FreeBSD hacks. */
289 #define cr_suid         cr_svuid
290 #define cr_sgid         cr_svgid
291 #define ipaddr_t        in_addr_t
292 #define mod_modname     pathname
293 #define vuprintf        vprintf
294 #define ttoproc(_a)     ((_a)->td_proc)
295 #define crgetzoneid(_a) 0
296 #define NCPU            MAXCPU
297 #define SNOCD           0
298 #define CPU_ON_INTR(_a) 0
299
300 #define PRIV_EFFECTIVE          (1 << 0)
301 #define PRIV_DTRACE_KERNEL      (1 << 1)
302 #define PRIV_DTRACE_PROC        (1 << 2)
303 #define PRIV_DTRACE_USER        (1 << 3)
304 #define PRIV_PROC_OWNER         (1 << 4)
305 #define PRIV_PROC_ZONE          (1 << 5)
306 #define PRIV_ALL                ~0
307
308 SYSCTL_DECL(_debug_dtrace);
309 SYSCTL_DECL(_kern_dtrace);
310 #endif
311
312 #if defined(sun)
313 #define curcpu  CPU->cpu_id
314 #endif
315
316
317 /*
318  * DTrace Provider Variables
319  *
320  * These are the variables relating to DTrace as a provider (that is, the
321  * provider of the BEGIN, END, and ERROR probes).
322  */
323 static dtrace_pattr_t   dtrace_provider_attr = {
324 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
325 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
326 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
327 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
328 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
329 };
330
/*
 * Do-nothing routine.  It takes no arguments, returns nothing and has no
 * side effects; it exists so that there is a valid function address to use
 * wherever an operation is required but no work needs to be done.
 */
static void
dtrace_nullop(void)
{
}
334
335 static dtrace_pops_t    dtrace_provider_ops = {
336         (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop,
337         (void (*)(void *, modctl_t *))dtrace_nullop,
338         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
339         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
340         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
341         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
342         NULL,
343         NULL,
344         NULL,
345         (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
346 };
347
348 static dtrace_id_t      dtrace_probeid_begin;   /* special BEGIN probe */
349 static dtrace_id_t      dtrace_probeid_end;     /* special END probe */
350 dtrace_id_t             dtrace_probeid_error;   /* special ERROR probe */
351
352 /*
353  * DTrace Helper Tracing Variables
354  *
355  * These variables should be set dynamically to enable helper tracing.  The
356  * only variables that should be set are dtrace_helptrace_enable (which should
357  * be set to a non-zero value to allocate helper tracing buffers on the next
358  * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
359  * non-zero value to deallocate helper tracing buffers on the next close of
360  * /dev/dtrace).  When (and only when) helper tracing is disabled, the
361  * buffer size may also be set via dtrace_helptrace_bufsize.
362  */
363 int                     dtrace_helptrace_enable = 0;
364 int                     dtrace_helptrace_disable = 0;
365 int                     dtrace_helptrace_bufsize = 16 * 1024 * 1024;
366 uint32_t                dtrace_helptrace_nlocals;
367 static dtrace_helptrace_t *dtrace_helptrace_buffer;
368 static uint32_t         dtrace_helptrace_next = 0;
369 static int              dtrace_helptrace_wrapped = 0;
370
371 /*
372  * DTrace Error Hashing
373  *
374  * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
375  * table.  This is very useful for checking coverage of tests that are
376  * expected to induce DIF or DOF processing errors, and may be useful for
377  * debugging problems in the DIF code generator or in DOF generation.  The
378  * error hash may be examined with the ::dtrace_errhash MDB dcmd.
379  */
380 #ifdef DEBUG
381 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
382 static const char *dtrace_errlast;
383 static kthread_t *dtrace_errthread;
384 static kmutex_t dtrace_errlock;
385 #endif
386
387 /*
388  * DTrace Macros and Constants
389  *
390  * These are various macros that are useful in various spots in the
391  * implementation, along with a few random constants that have no meaning
392  * outside of the implementation.  There is no real structure to this cpp
393  * mishmash -- but is there ever?
394  */
/*
 * Probe hash helpers.  A hash table records byte offsets (dth_stroffs,
 * dth_nextoffs, dth_prevoffs) into the probe structure; these macros add
 * the relevant offset to the probe's address to reach the keyed string or
 * the chain link.
 *
 * DTRACE_HASHSTR  - hash the string stored at dth_stroffs.
 * DTRACE_HASHNEXT - address of the "next" link stored at dth_nextoffs.
 * DTRACE_HASHPREV - address of the "prev" link stored at dth_prevoffs.
 * DTRACE_HASHEQ   - non-zero if the keyed strings of lhs and rhs compare
 *                   equal with strcmp().
 *
 * Every expansion is fully parenthesized so that the result can be used as
 * an operand of any operator (including postfix ones such as []).
 */
#define	DTRACE_HASHSTR(hash, probe)	\
	(dtrace_hash_str(*((char **)((uintptr_t)(probe) + \
	    (hash)->dth_stroffs))))

#define	DTRACE_HASHNEXT(hash, probe)	\
	((dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs))

#define	DTRACE_HASHPREV(hash, probe)	\
	((dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs))

#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
	    *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
407
408 #define DTRACE_AGGHASHSIZE_SLEW         17
409
410 #define DTRACE_V4MAPPED_OFFSET          (sizeof (uint32_t) * 3)
411
412 /*
413  * The key for a thread-local variable consists of the lower 61 bits of the
414  * t_did, plus the 3 bits of the highest active interrupt above LOCK_LEVEL.
415  * We add DIF_VARIABLE_MAX to t_did to assure that the thread key is never
416  * equal to a variable identifier.  This is necessary (but not sufficient) to
417  * assure that global associative arrays never collide with thread-local
418  * variables.  To guarantee that they cannot collide, we must also define the
419  * order for keying dynamic variables.  That order is:
420  *
421  *   [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
422  *
423  * Because the variable-key and the tls-key are in orthogonal spaces, there is
424  * no way for a global variable key signature to match a thread-local key
425  * signature.
426  */
427 #if defined(sun)
428 #define DTRACE_TLS_THRKEY(where) { \
429         uint_t intr = 0; \
430         uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
431         for (; actv; actv >>= 1) \
432                 intr++; \
433         ASSERT(intr < (1 << 3)); \
434         (where) = ((curthread->t_did + DIF_VARIABLE_MAX) & \
435             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
436 }
437 #else
438 #define DTRACE_TLS_THRKEY(where) { \
439         solaris_cpu_t *_c = &solaris_cpu[curcpu]; \
440         uint_t intr = 0; \
441         uint_t actv = _c->cpu_intr_actv; \
442         for (; actv; actv >>= 1) \
443                 intr++; \
444         ASSERT(intr < (1 << 3)); \
445         (where) = ((curthread->td_tid + DIF_VARIABLE_MAX) & \
446             (((uint64_t)1 << 61) - 1)) | ((uint64_t)intr << 61); \
447 }
448 #endif
449
/*
 * Byte-swap helpers built up from an 8-bit mask.  Each wider swap reverses
 * the low half, shifts it into the high half, and ORs in the reversed high
 * half.
 *
 * The 32- and 64-bit forms cast to a fixed-width unsigned type before
 * shifting, so that the shift is well defined whatever the width or
 * signedness of the operand (shifting into the sign bit of an int, or
 * shifting a 32-bit value by 32, is undefined behaviour in C).  As a
 * consequence DT_BSWAP_32 yields a uint32_t and DT_BSWAP_64 a uint64_t.
 * Note that the argument is evaluated more than once.
 */
#define	DT_BSWAP_8(x)	((x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	(((uint32_t)DT_BSWAP_16(x) << 16) | \
	(uint32_t)DT_BSWAP_16((uint32_t)(x) >> 16))
#define	DT_BSWAP_64(x)	(((uint64_t)DT_BSWAP_32(x) << 32) | \
	(uint64_t)DT_BSWAP_32((uint64_t)(x) >> 32))
454
455 #define DT_MASK_LO 0x00000000FFFFFFFFULL
456
/*
 * Store `what', converted to `type', at byte offset `offset' from the
 * address `tomax'.  The offset is parenthesized before being cast so that a
 * compound expression (for example a conditional) is used as a whole.
 *
 * The expansion deliberately keeps its historical trailing semicolon, so
 * existing "DTRACE_STORE(...);" call sites are unaffected; because of it the
 * macro must not be used as the unbraced body of an if that has an else.
 */
#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)(offset))) = (type)(what);
459
/*
 * Alignment check used before a load.  When __x86 is not defined, a
 * non-zero (addr & (size - 1)) sets CPU_DTRACE_BADALIGN in *flags, records
 * addr as the current CPU's illegal value and makes the *enclosing function*
 * return 0.  When __x86 is defined the macro expands to nothing.
 *
 * The mask test only detects misalignment when size is a power of two
 * (NOTE(review): callers are presumed to guarantee this -- confirm).
 * All arguments are parenthesized so that compound expressions are safe.
 */
#ifndef __x86
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if ((addr) & ((size) - 1)) {					\
		*(flags) |= CPU_DTRACE_BADALIGN;			\
		cpu_core[curcpu].cpuc_dtrace_illval = (addr);		\
		return (0);						\
	}
#else
#define	DTRACE_ALIGNCHECK(addr, size, flags)
#endif
470
/*
 * Evaluates to non-zero when the region [testaddr, testaddr + testsz) is
 * wholly contained in [baseaddr, baseaddr + basesz).  All arithmetic is
 * unsigned, so three conditions must hold together:
 *
 *   - testaddr + testsz does not wrap around;
 *   - the distance from baseaddr to testaddr is strictly less than basesz
 *     (a testaddr below baseaddr wraps to a huge distance and fails here);
 *   - the distance from baseaddr to the end of the tested region is no more
 *     than basesz.
 *
 * A zero testsz is accepted provided testaddr itself is inside the base
 * region.  The arguments are evaluated more than once.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
	(((testaddr) + (testsz) >= (testaddr)) && \
	((testaddr) - (uintptr_t)(baseaddr) < (basesz)) && \
	((testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz)))
481
/*
 * Evaluates to non-zero when alloc_sz more bytes can be taken from the
 * scratch region described by mstate.  The space remaining is computed as
 * (dtms_scratch_base + dtms_scratch_size - dtms_scratch_ptr) and compared
 * against alloc_sz on its own, so that alloc_sz never takes part in
 * arithmetic that could overflow or underflow.  This relies on
 * dtms_scratch_ptr already lying within the scratch region.  A request for
 * zero bytes always fits.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz) \
	((alloc_sz) <= (mstate)->dtms_scratch_base + \
	(mstate)->dtms_scratch_size - (mstate)->dtms_scratch_ptr)
492
493 #define DTRACE_LOADFUNC(bits)                                           \
494 /*CSTYLED*/                                                             \
495 uint##bits##_t                                                          \
496 dtrace_load##bits(uintptr_t addr)                                       \
497 {                                                                       \
498         size_t size = bits / NBBY;                                      \
499         /*CSTYLED*/                                                     \
500         uint##bits##_t rval;                                            \
501         int i;                                                          \
502         volatile uint16_t *flags = (volatile uint16_t *)                \
503             &cpu_core[curcpu].cpuc_dtrace_flags;                        \
504                                                                         \
505         DTRACE_ALIGNCHECK(addr, size, flags);                           \
506                                                                         \
507         for (i = 0; i < dtrace_toxranges; i++) {                        \
508                 if (addr >= dtrace_toxrange[i].dtt_limit)               \
509                         continue;                                       \
510                                                                         \
511                 if (addr + size <= dtrace_toxrange[i].dtt_base)         \
512                         continue;                                       \
513                                                                         \
514                 /*                                                      \
515                  * This address falls within a toxic region; return 0.  \
516                  */                                                     \
517                 *flags |= CPU_DTRACE_BADADDR;                           \
518                 cpu_core[curcpu].cpuc_dtrace_illval = addr;             \
519                 return (0);                                             \
520         }                                                               \
521                                                                         \
522         *flags |= CPU_DTRACE_NOFAULT;                                   \
523         /*CSTYLED*/                                                     \
524         rval = *((volatile uint##bits##_t *)addr);                      \
525         *flags &= ~CPU_DTRACE_NOFAULT;                                  \
526                                                                         \
527         return (!(*flags & CPU_DTRACE_FAULT) ? rval : 0);               \
528 }
529
530 #ifdef _LP64
531 #define dtrace_loadptr  dtrace_load64
532 #else
533 #define dtrace_loadptr  dtrace_load32
534 #endif
535
536 #define DTRACE_DYNHASH_FREE     0
537 #define DTRACE_DYNHASH_SINK     1
538 #define DTRACE_DYNHASH_VALID    2
539
540 #define DTRACE_MATCH_NEXT       0
541 #define DTRACE_MATCH_DONE       1
542 #define DTRACE_ANCHORED(probe)  ((probe)->dtpr_func[0] != '\0')
543 #define DTRACE_STATE_ALIGN      64
544
545 #define DTRACE_FLAGS2FLT(flags)                                         \
546         (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR :           \
547         ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP :                \
548         ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO :            \
549         ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV :                \
550         ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV :                \
551         ((flags) & CPU_DTRACE_TUPOFLOW) ?  DTRACEFLT_TUPOFLOW :         \
552         ((flags) & CPU_DTRACE_BADALIGN) ?  DTRACEFLT_BADALIGN :         \
553         ((flags) & CPU_DTRACE_NOSCRATCH) ?  DTRACEFLT_NOSCRATCH :       \
554         ((flags) & CPU_DTRACE_BADSTACK) ?  DTRACEFLT_BADSTACK :         \
555         DTRACEFLT_UNKNOWN)
556
557 #define DTRACEACT_ISSTRING(act)                                         \
558         ((act)->dta_kind == DTRACEACT_DIFEXPR &&                        \
559         (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
560
561 /* Function prototype definitions: */
562 static size_t dtrace_strlen(const char *, size_t);
563 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
564 static void dtrace_enabling_provide(dtrace_provider_t *);
565 static int dtrace_enabling_match(dtrace_enabling_t *, int *);
566 static void dtrace_enabling_matchall(void);
567 static void dtrace_enabling_reap(void);
568 static dtrace_state_t *dtrace_anon_grab(void);
569 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
570     dtrace_state_t *, uint64_t, uint64_t);
571 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
572 static void dtrace_buffer_drop(dtrace_buffer_t *);
573 static int dtrace_buffer_consumed(dtrace_buffer_t *, hrtime_t when);
574 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
575     dtrace_state_t *, dtrace_mstate_t *);
576 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
577     dtrace_optval_t);
578 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *);
579 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
580 uint16_t dtrace_load16(uintptr_t);
581 uint32_t dtrace_load32(uintptr_t);
582 uint64_t dtrace_load64(uintptr_t);
583 uint8_t dtrace_load8(uintptr_t);
584 void dtrace_dynvar_clean(dtrace_dstate_t *);
585 dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
586     size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
587 uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
588 static int dtrace_priv_proc(dtrace_state_t *);
589 static void dtrace_getf_barrier(void);
590
591 /*
592  * DTrace Probe Context Functions
593  *
594  * These functions are called from probe context.  Because probe context is
595  * any context in which C may be called, arbitrary locks may be held,
596  * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
597  * As a result, functions called from probe context may only call other DTrace
598  * support functions -- they may not interact at all with the system at large.
599  * (Note that the ASSERT macro is made probe-context safe by redefining it in
600  * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
601  * loads are to be performed from probe context, they _must_ be in terms of
602  * the safe dtrace_load*() variants.
603  *
604  * Some functions in this block are not actually called from probe context;
605  * for these functions, there will be a comment above the function reading
606  * "Note:  not called from probe context."
607  */
/*
 * Variadic front end for panicking with a printf-style message: the format
 * string and its arguments are packaged into a va_list and handed to
 * vpanic() when built for FreeBSD, or to dtrace_vpanic() otherwise.
 */
void
dtrace_panic(const char *format, ...)
{
	va_list args;

	va_start(args, format);
#ifndef __FreeBSD__
	dtrace_vpanic(format, args);
#else
	vpanic(format, args);
#endif
	va_end(args);
}
621
/*
 * Assertion-failure handler: reports the failed expression text (a), the
 * file name (f) and the line number (l) through dtrace_panic().
 */
int
dtrace_assfail(const char *a, const char *f, int l)
{
	const uintptr_t idx = (uintptr_t)f;

	dtrace_panic("assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * The value returned is meaningless; it is derived from both pointer
	 * arguments purely so that no compiler, however aggressive, can
	 * reduce it to a constant and discard it.
	 */
	return (a[idx]);
}
633
/*
 * Bump an error counter by one using a compare-and-swap retry loop, so the
 * update is safe against concurrent updates of the same counter.
 *
 * The counters updated here are shared rather than per-CPU: the conditions
 * they record are considered too obscure to deserve per-CPU storage.
 * Simultaneous increments from several CPUs will therefore contend, which is
 * accepted on the grounds that a correctly constructed enabling should not
 * hit these paths frequently.
 */
static void
dtrace_error(uint32_t *counter)
{
	uint32_t seen, next;

	for (;;) {
		seen = *counter;
		next = seen + 1;

		/*
		 * On wrap-around, restart at 1 rather than 0 so that a
		 * counter which has recorded at least one error never reads
		 * as zero.  (The counter is 32 bits wide because a 64-bit
		 * compare-and-swap is not guaranteed to be available.)
		 */
		if (next == 0)
			next = 1;

		/* Done once our update was installed over the value we read. */
		if (dtrace_cas32(counter, seen, next) == seen)
			break;
	}
}
672
673 /*
674  * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
675  * uint8_t, a uint16_t, a uint32_t and a uint64_t.
676  */
677 DTRACE_LOADFUNC(8)
678 DTRACE_LOADFUNC(16)
679 DTRACE_LOADFUNC(32)
680 DTRACE_LOADFUNC(64)
681
682 static int
683 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
684 {
685         if (dest < mstate->dtms_scratch_base)
686                 return (0);
687
688         if (dest + size < dest)
689                 return (0);
690
691         if (dest + size > mstate->dtms_scratch_ptr)
692                 return (0);
693
694         return (1);
695 }
696
697 static int
698 dtrace_canstore_statvar(uint64_t addr, size_t sz,
699     dtrace_statvar_t **svars, int nsvars)
700 {
701         int i;
702
703         for (i = 0; i < nsvars; i++) {
704                 dtrace_statvar_t *svar = svars[i];
705
706                 if (svar == NULL || svar->dtsv_size == 0)
707                         continue;
708
709                 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
710                         return (1);
711         }
712
713         return (0);
714 }
715
716 /*
717  * Check to see if the address is within a memory region to which a store may
718  * be issued.  This includes the DTrace scratch areas, and any DTrace variable
719  * region.  The caller of dtrace_canstore() is responsible for performing any
720  * alignment checks that are needed before stores are actually executed.
721  */
722 static int
723 dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
724     dtrace_vstate_t *vstate)
725 {
726         /*
727          * First, check to see if the address is in scratch space...
728          */
729         if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
730             mstate->dtms_scratch_size))
731                 return (1);
732
733         /*
734          * Now check to see if it's a dynamic variable.  This check will pick
735          * up both thread-local variables and any global dynamically-allocated
736          * variables.
737          */
738         if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
739             vstate->dtvs_dynvars.dtds_size)) {
740                 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
741                 uintptr_t base = (uintptr_t)dstate->dtds_base +
742                     (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
743                 uintptr_t chunkoffs;
744
745                 /*
746                  * Before we assume that we can store here, we need to make
747                  * sure that it isn't in our metadata -- storing to our
748                  * dynamic variable metadata would corrupt our state.  For
749                  * the range to not include any dynamic variable metadata,
750                  * it must:
751                  *
752                  *      (1) Start above the hash table that is at the base of
753                  *      the dynamic variable space
754                  *
755                  *      (2) Have a starting chunk offset that is beyond the
756                  *      dtrace_dynvar_t that is at the base of every chunk
757                  *
758                  *      (3) Not span a chunk boundary
759                  *
760                  */
761                 if (addr < base)
762                         return (0);
763
764                 chunkoffs = (addr - base) % dstate->dtds_chunksize;
765
766                 if (chunkoffs < sizeof (dtrace_dynvar_t))
767                         return (0);
768
769                 if (chunkoffs + sz > dstate->dtds_chunksize)
770                         return (0);
771
772                 return (1);
773         }
774
775         /*
776          * Finally, check the static local and global variables.  These checks
777          * take the longest, so we perform them last.
778          */
779         if (dtrace_canstore_statvar(addr, sz,
780             vstate->dtvs_locals, vstate->dtvs_nlocals))
781                 return (1);
782
783         if (dtrace_canstore_statvar(addr, sz,
784             vstate->dtvs_globals, vstate->dtvs_nglobals))
785                 return (1);
786
787         return (0);
788 }
789
790
791 /*
792  * Convenience routine to check to see if the address is within a memory
793  * region in which a load may be issued given the user's privilege level;
794  * if not, it sets the appropriate error flags and loads 'addr' into the
795  * illegal value slot.
796  *
797  * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
798  * appropriate memory access protection.
799  */
800 static int
801 dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
802     dtrace_vstate_t *vstate)
803 {
804         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
805         file_t *fp;
806
807         /*
808          * If we hold the privilege to read from kernel memory, then
809          * everything is readable.
810          */
811         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
812                 return (1);
813
814         /*
815          * You can obviously read that which you can store.
816          */
817         if (dtrace_canstore(addr, sz, mstate, vstate))
818                 return (1);
819
820         /*
821          * We're allowed to read from our own string table.
822          */
823         if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
824             mstate->dtms_difo->dtdo_strlen))
825                 return (1);
826
827         if (vstate->dtvs_state != NULL &&
828             dtrace_priv_proc(vstate->dtvs_state)) {
829                 proc_t *p;
830
831                 /*
832                  * When we have privileges to the current process, there are
833                  * several context-related kernel structures that are safe to
834                  * read, even absent the privilege to read from kernel memory.
835                  * These reads are safe because these structures contain only
836                  * state that (1) we're permitted to read, (2) is harmless or
837                  * (3) contains pointers to additional kernel state that we're
838                  * not permitted to read (and as such, do not present an
839                  * opportunity for privilege escalation).  Finally (and
840                  * critically), because of the nature of their relation with
841                  * the current thread context, the memory associated with these
842                  * structures cannot change over the duration of probe context,
843                  * and it is therefore impossible for this memory to be
844                  * deallocated and reallocated as something else while it's
845                  * being operated upon.
846                  */
847                 if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
848                         return (1);
849
850                 if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
851                     sz, curthread->t_procp, sizeof (proc_t))) {
852                         return (1);
853                 }
854
855                 if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
856                     curthread->t_cred, sizeof (cred_t))) {
857                         return (1);
858                 }
859
860 #if defined(sun)
861                 if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
862                     &(p->p_pidp->pid_id), sizeof (pid_t))) {
863                         return (1);
864                 }
865
866                 if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
867                     curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
868                         return (1);
869                 }
870 #endif
871         }
872
873         if ((fp = mstate->dtms_getf) != NULL) {
874                 uintptr_t psz = sizeof (void *);
875                 vnode_t *vp;
876                 vnodeops_t *op;
877
878                 /*
879                  * When getf() returns a file_t, the enabling is implicitly
880                  * granted the (transient) right to read the returned file_t
881                  * as well as the v_path and v_op->vnop_name of the underlying
882                  * vnode.  These accesses are allowed after a successful
883                  * getf() because the members that they refer to cannot change
884                  * once set -- and the barrier logic in the kernel's closef()
885                  * path assures that the file_t and its referenced vode_t
886                  * cannot themselves be stale (that is, it impossible for
887                  * either dtms_getf itself or its f_vnode member to reference
888                  * freed memory).
889                  */
890                 if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
891                         return (1);
892
893                 if ((vp = fp->f_vnode) != NULL) {
894 #if defined(sun)
895                         if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
896                                 return (1);
897                         if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
898                             vp->v_path, strlen(vp->v_path) + 1)) {
899                                 return (1);
900                         }
901 #endif
902
903                         if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
904                                 return (1);
905
906 #if defined(sun)
907                         if ((op = vp->v_op) != NULL &&
908                             DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
909                                 return (1);
910                         }
911
912                         if (op != NULL && op->vnop_name != NULL &&
913                             DTRACE_INRANGE(addr, sz, op->vnop_name,
914                             strlen(op->vnop_name) + 1)) {
915                                 return (1);
916                         }
917 #endif
918                 }
919         }
920
921         DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
922         *illval = addr;
923         return (0);
924 }
925
926 /*
927  * Convenience routine to check to see if a given string is within a memory
928  * region in which a load may be issued given the user's privilege level;
929  * this exists so that we don't need to issue unnecessary dtrace_strlen()
930  * calls in the event that the user has all privileges.
931  */
932 static int
933 dtrace_strcanload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
934     dtrace_vstate_t *vstate)
935 {
936         size_t strsz;
937
938         /*
939          * If we hold the privilege to read from kernel memory, then
940          * everything is readable.
941          */
942         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
943                 return (1);
944
945         strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr, sz);
946         if (dtrace_canload(addr, strsz, mstate, vstate))
947                 return (1);
948
949         return (0);
950 }
951
952 /*
953  * Convenience routine to check to see if a given variable is within a memory
954  * region in which a load may be issued given the user's privilege level.
955  */
956 static int
957 dtrace_vcanload(void *src, dtrace_diftype_t *type, dtrace_mstate_t *mstate,
958     dtrace_vstate_t *vstate)
959 {
960         size_t sz;
961         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
962
963         /*
964          * If we hold the privilege to read from kernel memory, then
965          * everything is readable.
966          */
967         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
968                 return (1);
969
970         if (type->dtdt_kind == DIF_TYPE_STRING)
971                 sz = dtrace_strlen(src,
972                     vstate->dtvs_state->dts_options[DTRACEOPT_STRSIZE]) + 1;
973         else
974                 sz = type->dtdt_size;
975
976         return (dtrace_canload((uintptr_t)src, sz, mstate, vstate));
977 }
978
979 /*
980  * Convert a string to a signed integer using safe loads.
981  *
982  * NOTE: This function uses various macros from strtolctype.h to manipulate
983  * digit values, etc -- these have all been checked to ensure they make
984  * no additional function calls.
985  */
986 static int64_t
987 dtrace_strtoll(char *input, int base, size_t limit)
988 {
989         uintptr_t pos = (uintptr_t)input;
990         int64_t val = 0;
991         int x;
992         boolean_t neg = B_FALSE;
993         char c, cc, ccc;
994         uintptr_t end = pos + limit;
995
996         /*
997          * Consume any whitespace preceding digits.
998          */
999         while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1000                 pos++;
1001
1002         /*
1003          * Handle an explicit sign if one is present.
1004          */
1005         if (c == '-' || c == '+') {
1006                 if (c == '-')
1007                         neg = B_TRUE;
1008                 c = dtrace_load8(++pos);
1009         }
1010
1011         /*
1012          * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1013          * if present.
1014          */
1015         if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1016             cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1017                 pos += 2;
1018                 c = ccc;
1019         }
1020
1021         /*
1022          * Read in contiguous digits until the first non-digit character.
1023          */
1024         for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1025             c = dtrace_load8(++pos))
1026                 val = val * base + x;
1027
1028         return (neg ? -val : val);
1029 }
1030
1031 /*
1032  * Compare two strings using safe loads.
1033  */
1034 static int
1035 dtrace_strncmp(char *s1, char *s2, size_t limit)
1036 {
1037         uint8_t c1, c2;
1038         volatile uint16_t *flags;
1039
1040         if (s1 == s2 || limit == 0)
1041                 return (0);
1042
1043         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1044
1045         do {
1046                 if (s1 == NULL) {
1047                         c1 = '\0';
1048                 } else {
1049                         c1 = dtrace_load8((uintptr_t)s1++);
1050                 }
1051
1052                 if (s2 == NULL) {
1053                         c2 = '\0';
1054                 } else {
1055                         c2 = dtrace_load8((uintptr_t)s2++);
1056                 }
1057
1058                 if (c1 != c2)
1059                         return (c1 - c2);
1060         } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1061
1062         return (0);
1063 }
1064
/*
 * Compute strlen(s) for a string using safe memory accesses.  The lim
 * parameter caps the number of bytes examined so that the scan is
 * guaranteed to complete: the return value is the number of bytes preceding
 * the first nul byte, or lim if no nul byte is found within the first lim
 * bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
        /*
         * The counter has the same type as the limit.  With a narrower
         * counter, a limit beyond the counter's range could never be
         * reached: the counter would wrap and the scan would only stop at a
         * nul byte.
         */
        size_t len;

        for (len = 0; len != lim; len++) {
                if (dtrace_load8((uintptr_t)s++) == '\0')
                        break;
        }

        return (len);
}
1081
1082 /*
1083  * Check if an address falls within a toxic region.
1084  */
1085 static int
1086 dtrace_istoxic(uintptr_t kaddr, size_t size)
1087 {
1088         uintptr_t taddr, tsize;
1089         int i;
1090
1091         for (i = 0; i < dtrace_toxranges; i++) {
1092                 taddr = dtrace_toxrange[i].dtt_base;
1093                 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1094
1095                 if (kaddr - taddr < tsize) {
1096                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1097                         cpu_core[curcpu].cpuc_dtrace_illval = kaddr;
1098                         return (1);
1099                 }
1100
1101                 if (taddr - kaddr < size) {
1102                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1103                         cpu_core[curcpu].cpuc_dtrace_illval = taddr;
1104                         return (1);
1105                 }
1106         }
1107
1108         return (0);
1109 }
1110
/*
 * Copy len bytes from src to dst.  The source is treated as unsafe memory
 * named by the DIF program and is read through dtrace_load8(); the
 * destination is DTrace-managed memory that is written directly.  As with
 * standard bcopy, overlapping regions are handled: the copy runs forward
 * when dst is at or below src and backward otherwise.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
        uint8_t *to = dst;
        const uint8_t *from = src;

        if (len == 0)
                return;

        if (to <= from) {
                /* Forward copy. */
                while (len-- != 0)
                        *to++ = dtrace_load8((uintptr_t)from++);
                return;
        }

        /* Backward copy, starting from one past the final byte. */
        to += len;
        from += len;

        while (len-- != 0)
                *--to = dtrace_load8((uintptr_t)--from);
}
1138
/*
 * Copy bytes from src to dst until either len bytes have been written or a
 * nul byte has been copied, whichever happens first.  The source is treated
 * as unsafe memory named by the DIF program and is read through
 * dtrace_load8(); the destination is DTrace-managed memory that is written
 * directly.  Unlike dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
        uint8_t *to = dst;
        const uint8_t *from = src;
        size_t left;

        for (left = len; left != 0; left--) {
                uint8_t ch = dtrace_load8((uintptr_t)from++);

                *to++ = ch;

                /* The terminator has been copied; nothing more to do. */
                if (ch == '\0')
                        break;
        }
}
1158
1159 /*
1160  * Copy src to dst, deriving the size and type from the specified (BYREF)
1161  * variable type.  The src is assumed to be unsafe memory specified by the DIF
1162  * program.  The dst is assumed to be DTrace variable memory that is of the
1163  * specified type; we assume that we can store to directly.
1164  */
1165 static void
1166 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type)
1167 {
1168         ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1169
1170         if (type->dtdt_kind == DIF_TYPE_STRING) {
1171                 dtrace_strcpy(src, dst, type->dtdt_size);
1172         } else {
1173                 dtrace_bcopy(src, dst, type->dtdt_size);
1174         }
1175 }
1176
1177 /*
1178  * Compare s1 to s2 using safe memory accesses.  The s1 data is assumed to be
1179  * unsafe memory specified by the DIF program.  The s2 data is assumed to be
1180  * safe memory that we can access directly because it is managed by DTrace.
1181  */
1182 static int
1183 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1184 {
1185         volatile uint16_t *flags;
1186
1187         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
1188
1189         if (s1 == s2)
1190                 return (0);
1191
1192         if (s1 == NULL || s2 == NULL)
1193                 return (1);
1194
1195         if (s1 != s2 && len != 0) {
1196                 const uint8_t *ps1 = s1;
1197                 const uint8_t *ps2 = s2;
1198
1199                 do {
1200                         if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1201                                 return (1);
1202                 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1203         }
1204         return (0);
1205 }
1206
/*
 * Clear len bytes starting at dst, one byte at a time.  The stores are
 * plain (unchecked) writes, so this is only for safe DTrace-managed memory.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
        uint8_t *bytes = dst;
        size_t i;

        for (i = 0; i < len; i++)
                bytes[i] = 0;
}
1219
/*
 * Add two 128-bit values, each stored as a pair of 64-bit words with the
 * low word at index 0, and place the result in sum.  Both inputs are read
 * in full before sum is written, so sum may alias either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
        uint64_t lo = addend1[0] + addend2[0];
        /* The low word wrapped if it ended up below either input. */
        uint64_t carry = (lo < addend1[0] || lo < addend2[0]) ? 1 : 0;
        uint64_t hi = addend1[1] + addend2[1] + carry;

        sum[0] = lo;
        sum[1] = hi;
}
1232
/*
 * Shift the 128-bit value in a by b bits, in place.  The value is stored as
 * two 64-bit words with the low word in a[0] and the high word in a[1].
 * A positive b shifts left, a negative b shifts right (logically, filling
 * with zeroes), and b == 0 leaves the value untouched.  Shifting by 128 bits
 * or more in either direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
        unsigned int n;

        if (b == 0)
                return;

        /*
         * Take the magnitude in unsigned arithmetic so that the most
         * negative int does not overflow on negation.
         */
        n = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

        /*
         * Every bit is shifted out.  Handling this up front also keeps each
         * 64-bit shift count below strictly under 64.
         */
        if (n >= 128) {
                a[0] = 0;
                a[1] = 0;
                return;
        }

        if (b < 0) {
                if (n >= 64) {
                        a[0] = a[1] >> (n - 64);
                        a[1] = 0;
                } else {
                        /*
                         * The low n bits of the high word become the top n
                         * bits of the low word.  The left shift discards
                         * the other bits by itself, so no mask is needed.
                         */
                        a[0] = (a[0] >> n) | (a[1] << (64 - n));
                        a[1] >>= n;
                }
        } else {
                if (n >= 64) {
                        a[1] = a[0] << (n - 64);
                        a[0] = 0;
                } else {
                        /*
                         * The top n bits of the low word become the low n
                         * bits of the high word.
                         */
                        a[1] = (a[1] << n) | (a[0] >> (64 - n));
                        a[0] <<= n;
                }
        }
}
1269
1270 /*
1271  * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1272  * use native multiplication on those, and then re-combine into the
1273  * resulting 128-bit value.
1274  *
1275  * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1276  *     hi1 * hi2 << 64 +
1277  *     hi1 * lo2 << 32 +
1278  *     hi2 * lo1 << 32 +
1279  *     lo1 * lo2
1280  */
1281 static void
1282 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1283 {
1284         uint64_t hi1, hi2, lo1, lo2;
1285         uint64_t tmp[2];
1286
1287         hi1 = factor1 >> 32;
1288         hi2 = factor2 >> 32;
1289
1290         lo1 = factor1 & DT_MASK_LO;
1291         lo2 = factor2 & DT_MASK_LO;
1292
1293         product[0] = lo1 * lo2;
1294         product[1] = hi1 * hi2;
1295
1296         tmp[0] = hi1 * lo2;
1297         tmp[1] = 0;
1298         dtrace_shift_128(tmp, 32);
1299         dtrace_add_128(product, tmp, product);
1300
1301         tmp[0] = hi2 * lo1;
1302         tmp[1] = 0;
1303         dtrace_shift_128(tmp, 32);
1304         dtrace_add_128(product, tmp, product);
1305 }
1306
1307 /*
1308  * This privilege check should be used by actions and subroutines to
1309  * verify that the user credentials of the process that enabled the
1310  * invoking ECB match the target credentials
1311  */
1312 static int
1313 dtrace_priv_proc_common_user(dtrace_state_t *state)
1314 {
1315         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1316
1317         /*
1318          * We should always have a non-NULL state cred here, since if cred
1319          * is null (anonymous tracing), we fast-path bypass this routine.
1320          */
1321         ASSERT(s_cr != NULL);
1322
1323         if ((cr = CRED()) != NULL &&
1324             s_cr->cr_uid == cr->cr_uid &&
1325             s_cr->cr_uid == cr->cr_ruid &&
1326             s_cr->cr_uid == cr->cr_suid &&
1327             s_cr->cr_gid == cr->cr_gid &&
1328             s_cr->cr_gid == cr->cr_rgid &&
1329             s_cr->cr_gid == cr->cr_sgid)
1330                 return (1);
1331
1332         return (0);
1333 }
1334
1335 /*
1336  * This privilege check should be used by actions and subroutines to
1337  * verify that the zone of the process that enabled the invoking ECB
1338  * matches the target credentials
1339  */
1340 static int
1341 dtrace_priv_proc_common_zone(dtrace_state_t *state)
1342 {
1343 #if defined(sun)
1344         cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1345
1346         /*
1347          * We should always have a non-NULL state cred here, since if cred
1348          * is null (anonymous tracing), we fast-path bypass this routine.
1349          */
1350         ASSERT(s_cr != NULL);
1351
1352         if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
1353                 return (1);
1354
1355         return (0);
1356 #else
1357         return (1);
1358 #endif
1359 }
1360
1361 /*
1362  * This privilege check should be used by actions and subroutines to
1363  * verify that the process has not setuid or changed credentials.
1364  */
1365 static int
1366 dtrace_priv_proc_common_nocd(void)
1367 {
1368         proc_t *proc;
1369
1370         if ((proc = ttoproc(curthread)) != NULL &&
1371             !(proc->p_flag & SNOCD))
1372                 return (1);
1373
1374         return (0);
1375 }
1376
1377 static int
1378 dtrace_priv_proc_destructive(dtrace_state_t *state)
1379 {
1380         int action = state->dts_cred.dcr_action;
1381
1382         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1383             dtrace_priv_proc_common_zone(state) == 0)
1384                 goto bad;
1385
1386         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1387             dtrace_priv_proc_common_user(state) == 0)
1388                 goto bad;
1389
1390         if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1391             dtrace_priv_proc_common_nocd() == 0)
1392                 goto bad;
1393
1394         return (1);
1395
1396 bad:
1397         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1398
1399         return (0);
1400 }
1401
1402 static int
1403 dtrace_priv_proc_control(dtrace_state_t *state)
1404 {
1405         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1406                 return (1);
1407
1408         if (dtrace_priv_proc_common_zone(state) &&
1409             dtrace_priv_proc_common_user(state) &&
1410             dtrace_priv_proc_common_nocd())
1411                 return (1);
1412
1413         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1414
1415         return (0);
1416 }
1417
1418 static int
1419 dtrace_priv_proc(dtrace_state_t *state)
1420 {
1421         if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1422                 return (1);
1423
1424         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1425
1426         return (0);
1427 }
1428
1429 static int
1430 dtrace_priv_kernel(dtrace_state_t *state)
1431 {
1432         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1433                 return (1);
1434
1435         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1436
1437         return (0);
1438 }
1439
1440 static int
1441 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1442 {
1443         if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1444                 return (1);
1445
1446         cpu_core[curcpu].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1447
1448         return (0);
1449 }
1450
1451 /*
1452  * Determine if the dte_cond of the specified ECB allows for processing of
1453  * the current probe to continue.  Note that this routine may allow continued
1454  * processing, but with access(es) stripped from the mstate's dtms_access
1455  * field.
1456  */
1457 static int
1458 dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
1459     dtrace_ecb_t *ecb)
1460 {
1461         dtrace_probe_t *probe = ecb->dte_probe;
1462         dtrace_provider_t *prov = probe->dtpr_provider;
1463         dtrace_pops_t *pops = &prov->dtpv_pops;
1464         int mode = DTRACE_MODE_NOPRIV_DROP;
1465
1466         ASSERT(ecb->dte_cond);
1467
1468 #if defined(sun)
1469         if (pops->dtps_mode != NULL) {
1470                 mode = pops->dtps_mode(prov->dtpv_arg,
1471                     probe->dtpr_id, probe->dtpr_arg);
1472
1473                 ASSERT((mode & DTRACE_MODE_USER) ||
1474                     (mode & DTRACE_MODE_KERNEL));
1475                 ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
1476                     (mode & DTRACE_MODE_NOPRIV_DROP));
1477         }
1478
1479         /*
1480          * If the dte_cond bits indicate that this consumer is only allowed to
1481          * see user-mode firings of this probe, call the provider's dtps_mode()
1482          * entry point to check that the probe was fired while in a user
1483          * context.  If that's not the case, use the policy specified by the
1484          * provider to determine if we drop the probe or merely restrict
1485          * operation.
1486          */
1487         if (ecb->dte_cond & DTRACE_COND_USERMODE) {
1488                 ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
1489
1490                 if (!(mode & DTRACE_MODE_USER)) {
1491                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1492                                 return (0);
1493
1494                         mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
1495                 }
1496         }
1497 #endif
1498
1499         /*
1500          * This is more subtle than it looks. We have to be absolutely certain
1501          * that CRED() isn't going to change out from under us so it's only
1502          * legit to examine that structure if we're in constrained situations.
1503          * Currently, the only times we'll this check is if a non-super-user
1504          * has enabled the profile or syscall providers -- providers that
1505          * allow visibility of all processes. For the profile case, the check
1506          * above will ensure that we're examining a user context.
1507          */
1508         if (ecb->dte_cond & DTRACE_COND_OWNER) {
1509                 cred_t *cr;
1510                 cred_t *s_cr = state->dts_cred.dcr_cred;
1511                 proc_t *proc;
1512
1513                 ASSERT(s_cr != NULL);
1514
1515                 if ((cr = CRED()) == NULL ||
1516                     s_cr->cr_uid != cr->cr_uid ||
1517                     s_cr->cr_uid != cr->cr_ruid ||
1518                     s_cr->cr_uid != cr->cr_suid ||
1519                     s_cr->cr_gid != cr->cr_gid ||
1520                     s_cr->cr_gid != cr->cr_rgid ||
1521                     s_cr->cr_gid != cr->cr_sgid ||
1522                     (proc = ttoproc(curthread)) == NULL ||
1523                     (proc->p_flag & SNOCD)) {
1524                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1525                                 return (0);
1526
1527 #if defined(sun)
1528                         mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
1529 #endif
1530                 }
1531         }
1532
1533 #if defined(sun)
1534         /*
1535          * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
1536          * in our zone, check to see if our mode policy is to restrict rather
1537          * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
1538          * and DTRACE_ACCESS_ARGS
1539          */
1540         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
1541                 cred_t *cr;
1542                 cred_t *s_cr = state->dts_cred.dcr_cred;
1543
1544                 ASSERT(s_cr != NULL);
1545
1546                 if ((cr = CRED()) == NULL ||
1547                     s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
1548                         if (mode & DTRACE_MODE_NOPRIV_DROP)
1549                                 return (0);
1550
1551                         mstate->dtms_access &=
1552                             ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
1553                 }
1554         }
1555 #endif
1556
1557         return (1);
1558 }
1559
1560 /*
1561  * Note:  not called from probe context.  This function is called
1562  * asynchronously (and at a regular interval) from outside of probe context to
1563  * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
1564  * cleaning is explained in detail in <sys/dtrace_impl.h>.
1565  */
1566 void
1567 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1568 {
1569         dtrace_dynvar_t *dirty;
1570         dtrace_dstate_percpu_t *dcpu;
1571         dtrace_dynvar_t **rinsep;
1572         int i, j, work = 0;
1573
1574         for (i = 0; i < NCPU; i++) {
1575                 dcpu = &dstate->dtds_percpu[i];
1576                 rinsep = &dcpu->dtdsc_rinsing;
1577
1578                 /*
1579                  * If the dirty list is NULL, there is no dirty work to do.
1580                  */
1581                 if (dcpu->dtdsc_dirty == NULL)
1582                         continue;
1583
1584                 if (dcpu->dtdsc_rinsing != NULL) {
1585                         /*
1586                          * If the rinsing list is non-NULL, then it is because
1587                          * this CPU was selected to accept another CPU's
1588                          * dirty list -- and since that time, dirty buffers
1589                          * have accumulated.  This is a highly unlikely
1590                          * condition, but we choose to ignore the dirty
1591                          * buffers -- they'll be picked up a future cleanse.
1592                          */
1593                         continue;
1594                 }
1595
1596                 if (dcpu->dtdsc_clean != NULL) {
1597                         /*
1598                          * If the clean list is non-NULL, then we're in a
1599                          * situation where a CPU has done deallocations (we
1600                          * have a non-NULL dirty list) but no allocations (we
1601                          * also have a non-NULL clean list).  We can't simply
1602                          * move the dirty list into the clean list on this
1603                          * CPU, yet we also don't want to allow this condition
1604                          * to persist, lest a short clean list prevent a
1605                          * massive dirty list from being cleaned (which in
1606                          * turn could lead to otherwise avoidable dynamic
1607                          * drops).  To deal with this, we look for some CPU
1608                          * with a NULL clean list, NULL dirty list, and NULL
1609                          * rinsing list -- and then we borrow this CPU to
1610                          * rinse our dirty list.
1611                          */
1612                         for (j = 0; j < NCPU; j++) {
1613                                 dtrace_dstate_percpu_t *rinser;
1614
1615                                 rinser = &dstate->dtds_percpu[j];
1616
1617                                 if (rinser->dtdsc_rinsing != NULL)
1618                                         continue;
1619
1620                                 if (rinser->dtdsc_dirty != NULL)
1621                                         continue;
1622
1623                                 if (rinser->dtdsc_clean != NULL)
1624                                         continue;
1625
1626                                 rinsep = &rinser->dtdsc_rinsing;
1627                                 break;
1628                         }
1629
1630                         if (j == NCPU) {
1631                                 /*
1632                                  * We were unable to find another CPU that
1633                                  * could accept this dirty list -- we are
1634                                  * therefore unable to clean it now.
1635                                  */
1636                                 dtrace_dynvar_failclean++;
1637                                 continue;
1638                         }
1639                 }
1640
1641                 work = 1;
1642
1643                 /*
1644                  * Atomically move the dirty list aside.
1645                  */
1646                 do {
1647                         dirty = dcpu->dtdsc_dirty;
1648
1649                         /*
1650                          * Before we zap the dirty list, set the rinsing list.
1651                          * (This allows for a potential assertion in
1652                          * dtrace_dynvar():  if a free dynamic variable appears
1653                          * on a hash chain, either the dirty list or the
1654                          * rinsing list for some CPU must be non-NULL.)
1655                          */
1656                         *rinsep = dirty;
1657                         dtrace_membar_producer();
1658                 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1659                     dirty, NULL) != dirty);
1660         }
1661
1662         if (!work) {
1663                 /*
1664                  * We have no work to do; we can simply return.
1665                  */
1666                 return;
1667         }
1668
1669         dtrace_sync();
1670
1671         for (i = 0; i < NCPU; i++) {
1672                 dcpu = &dstate->dtds_percpu[i];
1673
1674                 if (dcpu->dtdsc_rinsing == NULL)
1675                         continue;
1676
1677                 /*
1678                  * We are now guaranteed that no hash chain contains a pointer
1679                  * into this dirty list; we can make it clean.
1680                  */
1681                 ASSERT(dcpu->dtdsc_clean == NULL);
1682                 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1683                 dcpu->dtdsc_rinsing = NULL;
1684         }
1685
1686         /*
1687          * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1688          * sure that all CPUs have seen all of the dtdsc_clean pointers.
1689          * This prevents a race whereby a CPU incorrectly decides that
1690          * the state should be something other than DTRACE_DSTATE_CLEAN
1691          * after dtrace_dynvar_clean() has completed.
1692          */
1693         dtrace_sync();
1694
1695         dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1696 }
1697
1698 /*
1699  * Depending on the value of the op parameter, this function looks-up,
1700  * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
1701  * allocation is requested, this function will return a pointer to a
1702  * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1703  * variable can be allocated.  If NULL is returned, the appropriate counter
1704  * will be incremented.
 *
 * Arguments, as used by the code below:
 *   dstate        - dynamic variable state:  supplies the hash table
 *                   (dtds_hash, dtds_hashsize), the per-CPU free, clean,
 *                   dirty and rinsing lists (dtds_percpu) and dtds_chunksize.
 *   nkeys, key    - the tuple to look up.  A key with dttk_size == 0 is
 *                   hashed and compared by dttk_value; otherwise dttk_value
 *                   is treated as the address of dttk_size bytes.
 *   dsize         - size of the variable's data; only consulted when
 *                   checking that a new variable fits within one chunk.
 *   op            - DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
 *                   DTRACE_DYNVAR_DEALLOC.
 *   mstate/vstate - passed through to dtrace_canload() for by-reference keys.
 *
 * Returns the matching (or newly allocated) variable for lookups and
 * allocations.  Returns NULL when a fault is flagged after hashing, when no
 * match exists and op is not DTRACE_DYNVAR_ALLOC, when an allocation must be
 * dropped, and always for DTRACE_DYNVAR_DEALLOC.
1705  */
1706 dtrace_dynvar_t *
1707 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1708     dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1709     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1710 {
1711         uint64_t hashval = DTRACE_DYNHASH_VALID;
1712         dtrace_dynhash_t *hash = dstate->dtds_hash;
1713         dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1714         processorid_t me = curcpu, cpu = me;
1715         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[me];
1716         size_t bucket, ksize;
1717         size_t chunksize = dstate->dtds_chunksize;
1718         uintptr_t kdata, lock, nstate;
1719         uint_t i;
1720
1721         ASSERT(nkeys != 0);
1722
1723         /*
1724          * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
1725          * algorithm.  For the by-value portions, we perform the algorithm in
1726          * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
1727          * bit, and seems to have only a minute effect on distribution.  For
1728          * the by-reference data, we perform "One-at-a-time" iterating (safely)
1729          * over each referenced byte.  It's painful to do this, but it's much
1730          * better than pathological hash distribution.  The efficacy of the
1731          * hashing algorithm (and a comparison with other algorithms) may be
1732          * found by running the ::dtrace_dynstat MDB dcmd.
1733          */
1734         for (i = 0; i < nkeys; i++) {
1735                 if (key[i].dttk_size == 0) {
1736                         uint64_t val = key[i].dttk_value;
1737
1738                         hashval += (val >> 48) & 0xffff;
1739                         hashval += (hashval << 10);
1740                         hashval ^= (hashval >> 6);
1741
1742                         hashval += (val >> 32) & 0xffff;
1743                         hashval += (hashval << 10);
1744                         hashval ^= (hashval >> 6);
1745
1746                         hashval += (val >> 16) & 0xffff;
1747                         hashval += (hashval << 10);
1748                         hashval ^= (hashval >> 6);
1749
1750                         hashval += val & 0xffff;
1751                         hashval += (hashval << 10);
1752                         hashval ^= (hashval >> 6);
1753                 } else {
1754                         /*
1755                          * This is incredibly painful, but it beats the hell
1756                          * out of the alternative.
1757                          */
1758                         uint64_t j, size = key[i].dttk_size;
1759                         uintptr_t base = (uintptr_t)key[i].dttk_value;
1760
1761                         if (!dtrace_canload(base, size, mstate, vstate))
1762                                 break;
1763
1764                         for (j = 0; j < size; j++) {
1765                                 hashval += dtrace_load8(base + j);
1766                                 hashval += (hashval << 10);
1767                                 hashval ^= (hashval >> 6);
1768                         }
1769                 }
1770         }
1771
        /*
         * Bail out if a fault has been flagged on this CPU -- presumably by
         * one of the key loads above (NOTE(review):  confirm where the flag
         * is raised).
         */
1772         if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1773                 return (NULL);
1774
1775         hashval += (hashval << 3);
1776         hashval ^= (hashval >> 11);
1777         hashval += (hashval << 15);
1778
1779         /*
1780          * There is a remote chance (ideally, 1 in 2^31) that our hashval
1781          * comes out to be one of our two sentinel hash values.  If this
1782          * actually happens, we set the hashval to be a value known to be a
1783          * non-sentinel value.
1784          */
1785         if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1786                 hashval = DTRACE_DYNHASH_VALID;
1787
1788         /*
1789          * Yes, it's painful to do a divide here.  If the cycle count becomes
1790          * important here, tricks can be pulled to reduce it.  (However, it's
1791          * critical that hash collisions be kept to an absolute minimum;
1792          * they're much more painful than a divide.)  It's better to have a
1793          * solution that generates few collisions and still keeps things
1794          * relatively simple.
1795          */
1796         bucket = hashval % dstate->dtds_hashsize;
1797
1798         if (op == DTRACE_DYNVAR_DEALLOC) {
1799                 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1800
                /*
                 * Deallocations serialize on the bucket's lock word:  spin
                 * while its low bit is set, then atomically bump it by one
                 * (making it odd).  It is bumped again on the way out.
                 */
1801                 for (;;) {
1802                         while ((lock = *lockp) & 1)
1803                                 continue;
1804
1805                         if (dtrace_casptr((volatile void *)lockp,
1806                             (volatile void *)lock, (volatile void *)(lock + 1)) == (void *)lock)
1807                                 break;
1808                 }
1809
1810                 dtrace_membar_producer();
1811         }
1812
1813 top:
        /*
         * Snapshot the lock word before reading the chain; a lookup that
         * finds nothing compares it again before returning NULL.
         */
1814         prev = NULL;
1815         lock = hash[bucket].dtdh_lock;
1816
1817         dtrace_membar_consumer();
1818
1819         start = hash[bucket].dtdh_chain;
1820         ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1821             start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1822             op != DTRACE_DYNVAR_DEALLOC));
1823
1824         for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1825                 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1826                 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1827
1828                 if (dvar->dtdv_hashval != hashval) {
1829                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1830                                 /*
1831                                  * We've reached the sink, and therefore the
1832                                  * end of the hash chain; we can kick out of
1833                                  * the loop knowing that we have seen a valid
1834                                  * snapshot of state.
1835                                  */
1836                                 ASSERT(dvar->dtdv_next == NULL);
1837                                 ASSERT(dvar == &dtrace_dynhash_sink);
1838                                 break;
1839                         }
1840
1841                         if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
1842                                 /*
1843                                  * We've gone off the rails:  somewhere along
1844                                  * the line, one of the members of this hash
1845                                  * chain was deleted.  Note that we could also
1846                                  * detect this by simply letting this loop run
1847                                  * to completion, as we would eventually hit
1848                                  * the end of the dirty list.  However, we
1849                                  * want to avoid running the length of the
1850                                  * dirty list unnecessarily (it might be quite
1851                                  * long), so we catch this as early as
1852                                  * possible by detecting the hash marker.  In
1853                                  * this case, we simply set dvar to NULL and
1854                                  * break; the conditional after the loop will
1855                                  * send us back to top.
1856                                  */
1857                                 dvar = NULL;
1858                                 break;
1859                         }
1860
1861                         goto next;
1862                 }
1863
1864                 if (dtuple->dtt_nkeys != nkeys)
1865                         goto next;
1866
1867                 for (i = 0; i < nkeys; i++, dkey++) {
1868                         if (dkey->dttk_size != key[i].dttk_size)
1869                                 goto next; /* size or type mismatch */
1870
1871                         if (dkey->dttk_size != 0) {
1872                                 if (dtrace_bcmp(
1873                                     (void *)(uintptr_t)key[i].dttk_value,
1874                                     (void *)(uintptr_t)dkey->dttk_value,
1875                                     dkey->dttk_size))
1876                                         goto next;
1877                         } else {
1878                                 if (dkey->dttk_value != key[i].dttk_value)
1879                                         goto next;
1880                         }
1881                 }
1882
                /*
                 * Every key matched.  Lookups and allocations return the
                 * existing variable; only a dealloc continues on to unlink it.
                 */
1883                 if (op != DTRACE_DYNVAR_DEALLOC)
1884                         return (dvar);
1885
1886                 ASSERT(dvar->dtdv_next == NULL ||
1887                     dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
1888
1889                 if (prev != NULL) {
1890                         ASSERT(hash[bucket].dtdh_chain != dvar);
1891                         ASSERT(start != dvar);
1892                         ASSERT(prev->dtdv_next == dvar);
1893                         prev->dtdv_next = dvar->dtdv_next;
1894                 } else {
1895                         if (dtrace_casptr(&hash[bucket].dtdh_chain,
1896                             start, dvar->dtdv_next) != start) {
1897                                 /*
1898                                  * We have failed to atomically swing the
1899                                  * hash table head pointer, presumably because
1900                                  * of a conflicting allocation on another CPU.
1901                                  * We need to reread the hash chain and try
1902                                  * again.
1903                                  */
1904                                 goto top;
1905                         }
1906                 }
1907
1908                 dtrace_membar_producer();
1909
1910                 /*
1911                  * Now set the hash value to indicate that it's free.
1912                  */
1913                 ASSERT(hash[bucket].dtdh_chain != dvar);
1914                 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
1915
1916                 dtrace_membar_producer();
1917
1918                 /*
1919                  * Set the next pointer to point at the dirty list, and
1920                  * atomically swing the dirty pointer to the newly freed dvar.
1921                  */
1922                 do {
1923                         next = dcpu->dtdsc_dirty;
1924                         dvar->dtdv_next = next;
1925                 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
1926
1927                 /*
1928                  * Finally, unlock this hash bucket.
1929                  */
1930                 ASSERT(hash[bucket].dtdh_lock == lock);
1931                 ASSERT(lock & 1);
1932                 hash[bucket].dtdh_lock++;
1933
1934                 return (NULL);
1935 next:
                /*
                 * Remember the element just visited; a dealloc of its
                 * successor unlinks through prev rather than the bucket head.
                 */
1936                 prev = dvar;
1937                 continue;
1938         }
1939
1940         if (dvar == NULL) {
1941                 /*
1942                  * If dvar is NULL, it is because we went off the rails:
1943                  * one of the elements that we traversed in the hash chain
1944                  * was deleted while we were traversing it.  In this case,
1945                  * we assert that we aren't doing a dealloc (deallocs lock
1946                  * the hash bucket to prevent themselves from racing with
1947                  * one another), and retry the hash chain traversal.
1948                  */
1949                 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
1950                 goto top;
1951         }
1952
1953         if (op != DTRACE_DYNVAR_ALLOC) {
1954                 /*
1955                  * If we are not to allocate a new variable, we want to
1956                  * return NULL now.  Before we return, check that the value
1957                  * of the lock word hasn't changed.  If it has, we may have
1958                  * seen an inconsistent snapshot.
1959                  */
1960                 if (op == DTRACE_DYNVAR_NOALLOC) {
1961                         if (hash[bucket].dtdh_lock != lock)
1962                                 goto top;
1963                 } else {
1964                         ASSERT(op == DTRACE_DYNVAR_DEALLOC);
1965                         ASSERT(hash[bucket].dtdh_lock == lock);
1966                         ASSERT(lock & 1);
1967                         hash[bucket].dtdh_lock++;
1968                 }
1969
1970                 return (NULL);
1971         }
1972
1973         /*
1974          * We need to allocate a new dynamic variable.  The size we need is the
1975          * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
1976          * size of any auxiliary key data (rounded up to 8-byte alignment) plus
1977          * the size of any referred-to data (dsize).  We then round the final
1978          * size up to the chunksize for allocation.
1979          */
1980         for (ksize = 0, i = 0; i < nkeys; i++)
1981                 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
1982
1983         /*
1984          * This should be pretty much impossible, but could happen if, say,
1985          * strange DIF specified the tuple.  Ideally, this should be an
1986          * assertion and not an error condition -- but that requires that the
1987          * chunksize calculation in dtrace_difo_chunksize() be absolutely
1988          * bullet-proof.  (That is, it must not be able to be fooled by
1989          * malicious DIF.)  Given the lack of backwards branches in DIF,
1990          * solving this would presumably not amount to solving the Halting
1991          * Problem -- but it still seems awfully hard.
1992          */
1993         if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
1994             ksize + dsize > chunksize) {
1995                 dcpu->dtdsc_drops++;
1996                 return (NULL);
1997         }
1998
1999         nstate = DTRACE_DSTATE_EMPTY;
2000
2001         do {
2002 retry:
2003                 free = dcpu->dtdsc_free;
2004
2005                 if (free == NULL) {
2006                         dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2007                         void *rval;
2008
2009                         if (clean == NULL) {
2010                                 /*
2011                                  * We're out of dynamic variable space on
2012                                  * this CPU.  Unless we have tried all CPUs,
2013                                  * we'll try to allocate from a different
2014                                  * CPU.
2015                                  */
2016                                 switch (dstate->dtds_state) {
2017                                 case DTRACE_DSTATE_CLEAN: {
2018                                         void *sp = &dstate->dtds_state;
2019
2020                                         if (++cpu >= NCPU)
2021                                                 cpu = 0;
2022
2023                                         if (dcpu->dtdsc_dirty != NULL &&
2024                                             nstate == DTRACE_DSTATE_EMPTY)
2025                                                 nstate = DTRACE_DSTATE_DIRTY;
2026
2027                                         if (dcpu->dtdsc_rinsing != NULL)
2028                                                 nstate = DTRACE_DSTATE_RINSING;
2029
2030                                         dcpu = &dstate->dtds_percpu[cpu];
2031
2032                                         if (cpu != me)
2033                                                 goto retry;
2034
2035                                         (void) dtrace_cas32(sp,
2036                                             DTRACE_DSTATE_CLEAN, nstate);
2037
2038                                         /*
2039                                          * To increment the correct bean
2040                                          * counter, take another lap.
2041                                          */
2042                                         goto retry;
2043                                 }
2044
2045                                 case DTRACE_DSTATE_DIRTY:
2046                                         dcpu->dtdsc_dirty_drops++;
2047                                         break;
2048
2049                                 case DTRACE_DSTATE_RINSING:
2050                                         dcpu->dtdsc_rinsing_drops++;
2051                                         break;
2052
2053                                 case DTRACE_DSTATE_EMPTY:
2054                                         dcpu->dtdsc_drops++;
2055                                         break;
2056                                 }
2057
2058                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2059                                 return (NULL);
2060                         }
2061
2062                         /*
2063                          * The clean list appears to be non-empty.  We want to
2064                          * move the clean list to the free list; we start by
2065                          * moving the clean pointer aside.
2066                          */
2067                         if (dtrace_casptr(&dcpu->dtdsc_clean,
2068                             clean, NULL) != clean) {
2069                                 /*
2070                                  * We are in one of two situations:
2071                                  *
2072                                  *  (a) The clean list was switched to the
2073                                  *      free list by another CPU.
2074                                  *
2075                                  *  (b) The clean list was added to by the
2076                                  *      cleansing cyclic.
2077                                  *
2078                                  * In either of these situations, we can
2079                                  * just reattempt the free list allocation.
2080                                  */
2081                                 goto retry;
2082                         }
2083
2084                         ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2085
2086                         /*
2087                          * Now we'll move the clean list to our free list.
2088                          * It's impossible for this to fail:  the only way
2089                          * the free list can be updated is through this
2090                          * code path, and only one CPU can own the clean list.
2091                          * Thus, it would only be possible for this to fail if
2092                          * this code were racing with dtrace_dynvar_clean().
2093                          * (That is, if dtrace_dynvar_clean() updated the clean
2094                          * list, and we ended up racing to update the free
2095                          * list.)  This race is prevented by the dtrace_sync()
2096                          * in dtrace_dynvar_clean() -- which flushes the
2097                          * owners of the clean lists out before resetting
2098                          * the clean lists.
2099                          */
2100                         dcpu = &dstate->dtds_percpu[me];
2101                         rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2102                         ASSERT(rval == NULL);
2103                         goto retry;
2104                 }
2105
                /*
                 * Pop the head of the free list; the cas in the loop
                 * condition sends us around again if the head has changed.
                 */
2106                 dvar = free;
2107                 new_free = dvar->dtdv_next;
2108         } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2109
2110         /*
2111          * We have now allocated a new chunk.  We copy the tuple keys into the
2112          * tuple array and copy any referenced key data into the data space
2113          * following the tuple array.  As we do this, we relocate dttk_value
2114          * in the final tuple to point to the key data address in the chunk.
2115          */
2116         kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2117         dvar->dtdv_data = (void *)(kdata + ksize);
2118         dvar->dtdv_tuple.dtt_nkeys = nkeys;
2119
2120         for (i = 0; i < nkeys; i++) {
2121                 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2122                 size_t kesize = key[i].dttk_size;
2123
2124                 if (kesize != 0) {
2125                         dtrace_bcopy(
2126                             (const void *)(uintptr_t)key[i].dttk_value,
2127                             (void *)kdata, kesize);
2128                         dkey->dttk_value = kdata;
2129                         kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2130                 } else {
2131                         dkey->dttk_value = key[i].dttk_value;
2132                 }
2133
2134                 dkey->dttk_size = kesize;
2135         }
2136
2137         ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2138         dvar->dtdv_hashval = hashval;
2139         dvar->dtdv_next = start;
2140
        /*
         * Publish the new variable by swinging the bucket head from the
         * chain we traversed (start) to dvar.
         */
2141         if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2142                 return (dvar);
2143
2144         /*
2145          * The cas has failed.  Either another CPU is adding an element to
2146          * this hash chain, or another CPU is deleting an element from this
2147          * hash chain.  The simplest way to deal with both of these cases
2148          * (though not necessarily the most efficient) is to free our
2149          * allocated block and tail-call ourselves.  Note that the free is
2150          * to the dirty list and _not_ to the free list.  This is to prevent
2151          * races with allocators, above.
2152          */
2153         dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2154
2155         dtrace_membar_producer();
2156
2157         do {
2158                 free = dcpu->dtdsc_dirty;
2159                 dvar->dtdv_next = free;
2160         } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2161
2162         return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2163 }
2164
/*
 * Aggregation update that keeps a running minimum:  the stored value is
 * replaced by nval only when nval is smaller, with both compared as signed
 * 64-bit quantities.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t candidate = (int64_t)nval;
        int64_t current = (int64_t)*oval;

        if (candidate >= current)
                return;

        *oval = nval;
}
2172
/*
 * Aggregation update that keeps a running maximum:  the stored value is
 * replaced by nval only when nval is larger, with both compared as signed
 * 64-bit quantities.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        int64_t candidate = (int64_t)nval;
        int64_t current = (int64_t)*oval;

        if (candidate <= current)
                return;

        *oval = nval;
}
2180
2181 static void
2182 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2183 {
2184         int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2185         int64_t val = (int64_t)nval;
2186
2187         if (val < 0) {
2188                 for (i = 0; i < zero; i++) {
2189                         if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2190                                 quanta[i] += incr;
2191                                 return;
2192                         }
2193                 }
2194         } else {
2195                 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2196                         if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2197                                 quanta[i - 1] += incr;
2198                                 return;
2199                         }
2200                 }
2201
2202                 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2203                 return;
2204         }
2205
2206         ASSERT(0);
2207 }
2208
/*
 * Add incr to the linear-quantize bucket for nval.  The first word of
 * lquanta encodes the base, step and number of levels; the buckets follow
 * it:  bucket 0 counts underflow (value below base), buckets 1..levels
 * count the individual steps, and bucket levels + 1 counts overflow.
 * As before, nval is truncated to a signed 32-bit value before bucketing.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
        uint64_t arg = *lquanta++;
        int32_t base = DTRACE_LQUANTIZE_BASE(arg);
        uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
        uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
        int32_t val = (int32_t)nval;
        int64_t level;

        ASSERT(step != 0);
        ASSERT(levels != 0);

        if (val < base) {
                /*
                 * This is an underflow.
                 */
                lquanta[0] += incr;
                return;
        }

        /*
         * The distance from base can be as large as 2^32 - 1 (a negative
         * base with a large positive value), which does not fit in 32 bits.
         * Computing it in 32-bit arithmetic could overflow and yield a
         * negative level, which would index before the start of the bucket
         * array.  Do the arithmetic in 64 bits so that level is always
         * non-negative here.
         */
        level = ((int64_t)val - (int64_t)base) / step;

        if (level < levels) {
                lquanta[level + 1] += incr;
                return;
        }

        /*
         * This is an overflow.
         */
        lquanta[levels + 1] += incr;
}
2241
/*
 * Compute the log/linear quantize bucket index for value.  Bucket 0 holds
 * values below factor^low.  Each order of magnitude from low through high
 * is then divided into at most nsteps linear buckets.  Values at or beyond
 * the top order of magnitude map to the final bucket.
 */
static int
dtrace_aggregate_llquantize_bucket(uint16_t factor, uint16_t low,
    uint16_t high, uint16_t nsteps, int64_t value)
{
        int64_t cur = 1, floor, upper;
        int base = 1, order;

        ASSERT(factor <= nsteps);
        ASSERT(nsteps % factor == 0);

        /*
         * Raise factor to the low order of magnitude.
         */
        order = 0;
        while (order < low) {
                cur *= factor;
                order++;
        }

        /*
         * Anything below factor^low goes into the zeroth bucket.
         */
        floor = cur;
        if (value < floor)
                return (0);

        cur *= factor;

        while (order <= high) {
                int nbuckets = cur > nsteps ? nsteps : cur;

                upper = cur * factor;

                if (upper < cur) {
                        /*
                         * We should not generally get log/linear
                         * quantizations with a high magnitude that allows
                         * 64-bits to overflow, but we nonetheless protect
                         * against this by explicitly checking for overflow,
                         * and clamping our value accordingly.
                         */
                        value = cur - 1;
                }

                if (value < cur) {
                        /*
                         * The value lies within this order of magnitude:
                         * take its offset from the bottom of the order,
                         * divide by the bucket width, and add the
                         * accumulated base.
                         */
                        return (base + (value - floor) / (cur / nbuckets));
                }

                base += nbuckets - (nbuckets / factor);
                floor = cur;
                cur = upper;
                order++;
        }

        /*
         * The value is greater than or equal to factor taken to the power
         * of one plus the high magnitude -- return the top bucket.
         */
        return (base);
}
2297
/*
 * Aggregating action for llquantize().  The first 64-bit word of llquanta
 * carries the packed quantization parameters (factor, low, high, nsteps);
 * the buckets follow it.  The bucket selected for nval is incremented by
 * incr.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
        const uint64_t params = llquanta[0];
        uint64_t *buckets = llquanta + 1;
        int slot;

        slot = dtrace_aggregate_llquantize_bucket(
            DTRACE_LLQUANTIZE_FACTOR(params),
            DTRACE_LLQUANTIZE_LOW(params),
            DTRACE_LLQUANTIZE_HIGH(params),
            DTRACE_LLQUANTIZE_NSTEP(params), nval);

        buckets[slot] += incr;
}
2310
/*
 * Aggregating action for avg():  data[0] counts the samples and data[1]
 * accumulates their sum.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t *samples = &data[0];
        uint64_t *total = &data[1];

        *samples += 1;
        *total = *total + nval;
}
2318
/*
 * Aggregating action for stddev():  data[0] counts the samples, data[1]
 * accumulates their sum, and data[2..3] accumulate the sum of their squares
 * as a 128-bit quantity.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
        uint64_t magnitude = nval;
        uint64_t square[2];

        data[0]++;
        data[1] += nval;

        /*
         * What we want to say here is:
         *
         * data[2] += nval * nval;
         *
         * But given that nval is 64-bit, we could easily overflow, so
         * we do this as 128-bit arithmetic on the absolute value.
         *
         * The absolute value is taken with unsigned arithmetic:  negating
         * the signed representation would overflow (undefined behavior)
         * when nval is the most negative 64-bit value, whereas unsigned
         * subtraction wraps and yields the correct magnitude (2^63).
         */
        if ((int64_t)nval < 0)
                magnitude = 0 - nval;

        dtrace_multiply_128(magnitude, magnitude, square);
        dtrace_add_128(data + 2, square, data + 2);
}
2343
/*
 * Aggregating action for count():  bump the stored counter by one.  Both
 * nval and arg are unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        oval[0] += 1;
}
2350
/*
 * Aggregating action for sum():  add nval to the stored total (with the
 * usual unsigned 64-bit wraparound).  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
        oval[0] = oval[0] + nval;
}
2357
2358 /*
2359  * Aggregate given the tuple in the principal data buffer, and the aggregating
2360  * action denoted by the specified dtrace_aggregation_t.  The aggregation
2361  * buffer is specified as the buf parameter.  This routine does not return
2362  * failure; if there is no space in the aggregation buffer, the data will be
2363  * dropped, and a corresponding counter incremented.
      *
      * Parameters, as used by the code below:
      *
      *   agg     the aggregation; supplies the tuple layout (dtag_first,
      *           dtag_base), the aggregating function (dtag_aggregate), the
      *           initial value (dtag_initial) and the aggregation id.
      *   dbuf    principal buffer; the tuple is read from
      *           dbuf->dtb_tomax + offset + agg->dtag_base.
      *   offset  offset of the current record within dbuf.
      *   buf     aggregation buffer.  If NULL, the routine returns at once
      *           without recording a drop; if its dtb_tomax is NULL a drop is
      *           recorded instead.
      *   expr    the value handed to the aggregating function.
      *   arg     the aggregating function's extra argument; forced to 1 when
      *           agg->dtag_hasarg is not set.
      *
      * Buffer layout:  key data grows upward from the start of the buffer
      * (tracked by dtb_offset); the dtrace_aggbuffer_t metastructure sits at
      * the very end, the hash table immediately below it, and dtrace_aggkey_t
      * structures are carved downward from dtagb_free.
      *
      * If an existing key matches but was created by a different aggregating
      * action, CPU_DTRACE_ILLOP is set and nothing is aggregated.
2364  */
2365 static void
2366 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2367     intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2368 {
2369         dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2370         uint32_t i, ndx, size, fsize;
2371         uint32_t align = sizeof (uint64_t) - 1;
2372         dtrace_aggbuffer_t *agb;
2373         dtrace_aggkey_t *key;
2374         uint32_t hashval = 0, limit, isstr;
2375         caddr_t tomax, data, kdata;
2376         dtrace_actkind_t action;
2377         dtrace_action_t *act;
2378         uintptr_t offs;
2379
2380         if (buf == NULL)
2381                 return;
2382
2383         if (!agg->dtag_hasarg) {
2384                 /*
2385                  * Currently, only quantize() and lquantize() take additional
2386                  * arguments, and they have the same semantics:  an increment
2387                  * value that defaults to 1 when not present.  If additional
2388                  * aggregating actions take arguments, the setting of the
2389                  * default argument value will presumably have to become more
2390                  * sophisticated...
2391                  */
2392                 arg = 1;
2393         }
2394
             /*
              * size is the length of the key (tuple) portion, measured from
              * dtag_base up to the aggregating action's record; fsize
              * additionally includes that record's own storage.
              */
2395         action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2396         size = rec->dtrd_offset - agg->dtag_base;
2397         fsize = size + rec->dtrd_size;
2398
2399         ASSERT(dbuf->dtb_tomax != NULL);
2400         data = dbuf->dtb_tomax + offset + agg->dtag_base;
2401
2402         if ((tomax = buf->dtb_tomax) == NULL) {
2403                 dtrace_buffer_drop(buf);
2404                 return;
2405         }
2406
2407         /*
2408          * The metastructure is always at the bottom of the buffer.
2409          */
2410         agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2411             sizeof (dtrace_aggbuffer_t));
2412
             /*
              * A zero dtb_offset means nothing has been stored yet, so the
              * metastructure and hash table are (re)initialized here.
              */
2413         if (buf->dtb_offset == 0) {
2414                 /*
2415                  * We just kludge up approximately 1/8th of the size to be
2416                  * buckets.  If this guess ends up being routinely
2417                  * off-the-mark, we may need to dynamically readjust this
2418                  * based on past performance.
2419                  */
2420                 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2421
2422                 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2423                     (uintptr_t)tomax || hashsize == 0) {
2424                         /*
2425                          * We've been given a ludicrously small buffer;
2426                          * increment our drop count and leave.
2427                          */
2428                         dtrace_buffer_drop(buf);
2429                         return;
2430                 }
2431
2432                 /*
2433                  * And now, a pathetic attempt to try to get an odd (or
2434                  * perchance, a prime) hash size for better hash distribution.
2435                  */
2436                 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2437                         hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2438
2439                 agb->dtagb_hashsize = hashsize;
2440                 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2441                     agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2442                 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2443
2444                 for (i = 0; i < agb->dtagb_hashsize; i++)
2445                         agb->dtagb_hash[i] = NULL;
2446         }
2447
2448         ASSERT(agg->dtag_first != NULL);
2449         ASSERT(agg->dtag_first->dta_intuple);
2450
2451         /*
2452          * Calculate the hash value based on the key.  Note that we _don't_
2453          * include the aggid in the hashing (but we will store it as part of
2454          * the key).  The hashing algorithm is Bob Jenkins' "One-at-a-time"
2455          * algorithm: a simple, quick algorithm that has no known funnels, and
2456          * gets good distribution in practice.  The efficacy of the hashing
2457          * algorithm (and a comparison with other algorithms) may be found by
2458          * running the ::dtrace_aggstat MDB dcmd.
              *
              * For string actions, hashing stops after the terminating NUL has
              * been mixed in, so bytes beyond the end of the string do not
              * affect the hash.
2459          */
2460         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2461                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2462                 limit = i + act->dta_rec.dtrd_size;
2463                 ASSERT(limit <= size);
2464                 isstr = DTRACEACT_ISSTRING(act);
2465
2466                 for (; i < limit; i++) {
2467                         hashval += data[i];
2468                         hashval += (hashval << 10);
2469                         hashval ^= (hashval >> 6);
2470
2471                         if (isstr && data[i] == '\0')
2472                                 break;
2473                 }
2474         }
2475
2476         hashval += (hashval << 3);
2477         hashval ^= (hashval >> 11);
2478         hashval += (hashval << 15);
2479
2480         /*
2481          * Yes, the divide here is expensive -- but it's generally the least
2482          * of the performance issues given the amount of data that we iterate
2483          * over to compute hash values, compare data, etc.
2484          */
2485         ndx = hashval % agb->dtagb_hashsize;
2486
             /*
              * Walk the hash chain looking for a key with the same hash, the
              * same size and byte-identical tuple contents (strings are
              * compared only up to and including their terminating NUL).
              */
2487         for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2488                 ASSERT((caddr_t)key >= tomax);
2489                 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2490
2491                 if (hashval != key->dtak_hashval || key->dtak_size != size)
2492                         continue;
2493
2494                 kdata = key->dtak_data;
2495                 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2496
2497                 for (act = agg->dtag_first; act->dta_intuple;
2498                     act = act->dta_next) {
2499                         i = act->dta_rec.dtrd_offset - agg->dtag_base;
2500                         limit = i + act->dta_rec.dtrd_size;
2501                         ASSERT(limit <= size);
2502                         isstr = DTRACEACT_ISSTRING(act);
2503
2504                         for (; i < limit; i++) {
2505                                 if (kdata[i] != data[i])
2506                                         goto next;
2507
2508                                 if (isstr && data[i] == '\0')
2509                                         break;
2510                         }
2511                 }
2512
2513                 if (action != key->dtak_action) {
2514                         /*
2515                          * We are aggregating on the same value in the same
2516                          * aggregation with two different aggregating actions.
2517                          * (This should have been picked up in the compiler,
2518                          * so we may be dealing with errant or devious DIF.)
2519                          * This is an error condition; we indicate as much,
2520                          * and return.
2521                          */
2522                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2523                         return;
2524                 }
2525
2526                 /*
2527                  * This is a hit:  we need to apply the aggregator to
2528                  * the value at this key.
2529                  */
2530                 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2531                 return;
2532 next:
2533                 continue;
2534         }
2535
2536         /*
2537          * We didn't find it.  We need to allocate some zero-filled space,
2538          * link it into the hash table appropriately, and apply the aggregator
2539          * to the (zero-filled) value.
2540          */
             /*
              * NOTE(review): align is already sizeof (uint64_t) - 1 (i.e. 7),
              * so the mask tested below is align - 1 (i.e. 6).  Stepping by
              * sizeof (uint32_t) reaches 8-byte alignment only if offs is
              * already 4-byte aligned; if bit 1 of offs were ever set this
              * loop would not terminate.  Confirm that dtb_offset is always a
              * multiple of 4 here.
              */
2541         offs = buf->dtb_offset;
2542         while (offs & (align - 1))
2543                 offs += sizeof (uint32_t);
2544
2545         /*
2546          * If we don't have enough room to both allocate a new key _and_
2547          * its associated data, increment the drop count and return.
2548          */
2549         if ((uintptr_t)tomax + offs + fsize >
2550             agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2551                 dtrace_buffer_drop(buf);
2552                 return;
2553         }
2554
             /*
              * The key structure is carved downward from dtagb_free, while the
              * key's data is placed upward at tomax + offs.
              */
2555         /*CONSTCOND*/
2556         ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2557         key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2558         agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2559
2560         key->dtak_data = kdata = tomax + offs;
2561         buf->dtb_offset = offs + fsize;
2562
2563         /*
2564          * Now copy the data across.
              * The leading dtrace_aggid_t of the key is set to the
              * aggregation's id rather than copied from the traced data.
2565          */
2566         *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2567
2568         for (i = sizeof (dtrace_aggid_t); i < size; i++)
2569                 kdata[i] = data[i];
2570
2571         /*
2572          * Because strings are not zeroed out by default, we need to iterate
2573          * looking for actions that store strings, and we need to explicitly
2574          * pad these strings out with zeroes.
2575          */
2576         for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2577                 int nul;
2578
2579                 if (!DTRACEACT_ISSTRING(act))
2580                         continue;
2581
2582                 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2583                 limit = i + act->dta_rec.dtrd_size;
2584                 ASSERT(limit <= size);
2585
                     /*
                      * Once the first NUL has been seen in the source, every
                      * following byte of the stored string is zeroed.
                      */
2586                 for (nul = 0; i < limit; i++) {
2587                         if (nul) {
2588                                 kdata[i] = '\0';
2589                                 continue;
2590                         }
2591
2592                         if (data[i] != '\0')
2593                                 continue;
2594
2595                         nul = 1;
2596                 }
2597         }
2598
             /*
              * Zero the storage for the aggregation value itself.
              */
2599         for (i = size; i < fsize; i++)
2600                 kdata[i] = 0;
2601
             /*
              * Fill in the key and push it onto the front of its hash chain.
              */
2602         key->dtak_hashval = hashval;
2603         key->dtak_size = size;
2604         key->dtak_action = action;
2605         key->dtak_next = agb->dtagb_hash[ndx];
2606         agb->dtagb_hash[ndx] = key;
2607
2608         /*
2609          * Finally, apply the aggregator.
              * The first 64-bit word of the value is seeded with dtag_initial
              * before the aggregator is applied for the first time.
2610          */
2611         *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2612         agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2613 }
2614
2615 /*
2616  * Given consumer state, this routine finds a speculation in the INACTIVE
2617  * state and transitions it into the ACTIVE state.  If there is no speculation
2618  * in the INACTIVE state, 0 is returned.  In this case, no error counter is
2619  * incremented -- it is up to the caller to take appropriate action.
2620  */
2621 static int
2622 dtrace_speculation(dtrace_state_t *state)
2623 {
2624         int i = 0;
2625         dtrace_speculation_state_t current;
2626         uint32_t *stat = &state->dts_speculations_unavail, count;
2627
2628         while (i < state->dts_nspeculations) {
2629                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2630
2631                 current = spec->dtsp_state;
2632
2633                 if (current != DTRACESPEC_INACTIVE) {
2634                         if (current == DTRACESPEC_COMMITTINGMANY ||
2635                             current == DTRACESPEC_COMMITTING ||
2636                             current == DTRACESPEC_DISCARDING)
2637                                 stat = &state->dts_speculations_busy;
2638                         i++;
2639                         continue;
2640                 }
2641
2642                 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2643                     current, DTRACESPEC_ACTIVE) == current)
2644                         return (i + 1);
2645         }
2646
2647         /*
2648          * We couldn't find a speculation.  If we found as much as a single
2649          * busy speculation buffer, we'll attribute this failure as "busy"
2650          * instead of "unavail".
2651          */
2652         do {
2653                 count = *stat;
2654         } while (dtrace_cas32(stat, count, count + 1) != count);
2655
2656         return (0);
2657 }
2658
2659 /*
2660  * This routine commits an active speculation.  If the specified speculation
2661  * is not in a valid state to perform a commit(), this routine will silently do
2662  * nothing.  The state of the specified speculation is transitioned according
2663  * to the state transition diagram outlined in <sys/dtrace_impl.h>
      *
      * Parameters, as used by the code below:
      *
      *   state   consumer state owning both the speculations and the
      *           per-CPU principal buffers (dts_buffer).
      *   cpu     index of the CPU whose speculative buffer is copied into
      *           that CPU's principal buffer.
      *   which   1-based speculation identifier.  0 is a no-op; a value
      *           greater than dts_nspeculations sets CPU_DTRACE_ILLOP for
      *           the given CPU and returns.
      *
      * If space cannot be reserved in the destination buffer, a drop is
      * recorded against it and the speculative data is discarded.
2664  */
2665 static void
2666 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2667     dtrace_specid_t which)
2668 {
2669         dtrace_speculation_t *spec;
2670         dtrace_buffer_t *src, *dest;
2671         uintptr_t daddr, saddr, dlimit, slimit;
2672         dtrace_speculation_state_t current, new = 0;
2673         intptr_t offs;
2674         uint64_t timestamp;
2675
2676         if (which == 0)
2677                 return;
2678
2679         if (which > state->dts_nspeculations) {
2680                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2681                 return;
2682         }
2683
2684         spec = &state->dts_speculations[which - 1];
2685         src = &spec->dtsp_buffer[cpu];
2686         dest = &state->dts_buffer[cpu];
2687
             /*
              * Pick the next state from the current one and install it with a
              * compare-and-swap, retrying if the state changed in between.
              */
2688         do {
2689                 current = spec->dtsp_state;
2690
                     /*
                      * Already COMMITTINGMANY:  leave the loop without any
                      * state change and go on to commit this CPU's buffer.
                      */
2691                 if (current == DTRACESPEC_COMMITTINGMANY)
2692                         break;
2693
2694                 switch (current) {
2695                 case DTRACESPEC_INACTIVE:
2696                 case DTRACESPEC_DISCARDING:
2697                         return;
2698
2699                 case DTRACESPEC_COMMITTING:
2700                         /*
2701                          * This is only possible if we are (a) commit()'ing
2702                          * without having done a prior speculate() on this CPU
2703                          * and (b) racing with another commit() on a different
2704                          * CPU.  There's nothing to do -- we just assert that
2705                          * our offset is 0.
2706                          */
2707                         ASSERT(src->dtb_offset == 0);
2708                         return;
2709
2710                 case DTRACESPEC_ACTIVE:
2711                         new = DTRACESPEC_COMMITTING;
2712                         break;
2713
2714                 case DTRACESPEC_ACTIVEONE:
2715                         /*
2716                          * This speculation is active on one CPU.  If our
2717                          * buffer offset is non-zero, we know that the one CPU
2718                          * must be us.  Otherwise, we are committing on a
2719                          * different CPU from the speculate(), and we must
2720                          * rely on being asynchronously cleaned.
2721                          */
2722                         if (src->dtb_offset != 0) {
2723                                 new = DTRACESPEC_COMMITTING;
2724                                 break;
2725                         }
2726                         /*FALLTHROUGH*/
2727
2728                 case DTRACESPEC_ACTIVEMANY:
2729                         new = DTRACESPEC_COMMITTINGMANY;
2730                         break;
2731
2732                 default:
2733                         ASSERT(0);
2734                 }
2735         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2736             current, new) != current);
2737
2738         /*
2739          * We have set the state to indicate that we are committing this
2740          * speculation.  Now reserve the necessary space in the destination
2741          * buffer.
2742          */
2743         if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2744             sizeof (uint64_t), state, NULL)) < 0) {
2745                 dtrace_buffer_drop(dest);
2746                 goto out;
2747         }
2748
2749         /*
2750          * We have sufficient space to copy the speculative buffer into the
2751          * primary buffer.  First, modify the speculative buffer, filling
2752          * in the timestamp of all entries with the current time.  The data
2753          * must have the commit() time rather than the time it was traced,
2754          * so that all entries in the primary buffer are in timestamp order.
2755          */
2756         timestamp = dtrace_gethrtime();
2757         saddr = (uintptr_t)src->dtb_tomax;
2758         slimit = saddr + src->dtb_offset;
2759         while (saddr < slimit) {
2760                 size_t size;
2761                 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2762
                     /*
                      * A DTRACE_EPIDNONE id is stepped over one
                      * dtrace_epid_t at a time; it carries no record.
                      */
2763                 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2764                         saddr += sizeof (dtrace_epid_t);
2765                         continue;
2766                 }
                     /*
                      * The record length is the dte_size of the ECB that
                      * the (1-based) epid refers to.
                      */
2767                 ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
2768                 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2769
2770                 ASSERT3U(saddr + size, <=, slimit);
2771                 ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
2772                 ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
2773
2774                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2775
2776                 saddr += size;
2777         }
2778
2779         /*
2780          * Copy the buffer across.  (Note that this is a
2781          * highly suboptimal bcopy(); in the unlikely event that this becomes
2782          * a serious performance issue, a high-performance DTrace-specific
2783          * bcopy() should obviously be invented.)
2784          */
2785         daddr = (uintptr_t)dest->dtb_tomax + offs;
2786         dlimit = daddr + src->dtb_offset;
2787         saddr = (uintptr_t)src->dtb_tomax;
2788
2789         /*
2790          * First, the aligned portion.
2791          */
2792         while (dlimit - daddr >= sizeof (uint64_t)) {
2793                 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2794
2795                 daddr += sizeof (uint64_t);
2796                 saddr += sizeof (uint64_t);
2797         }
2798
2799         /*
2800          * Now any left-over bit...
2801          */
2802         while (dlimit - daddr)
2803                 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2804
2805         /*
2806          * Finally, commit the reserved space in the destination buffer.
2807          */
2808         dest->dtb_offset = offs + src->dtb_offset;
2809
2810 out:
2811         /*
2812          * If we're lucky enough to be the only active CPU on this speculation
2813          * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
              * (A failed reservation above also arrives here, so the state is
              * still unwound and the speculative buffer still reset.)
2814          */
2815         if (current == DTRACESPEC_ACTIVE ||
2816             (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2817                 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2818                     DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2819
2820                 ASSERT(rval == DTRACESPEC_COMMITTING);
2821         }
2822
             /*
              * Reset this CPU's speculative buffer, folding its drop count
              * into dtb_xamot_drops before clearing it.
              */
2823         src->dtb_offset = 0;
2824         src->dtb_xamot_drops += src->dtb_drops;
2825         src->dtb_drops = 0;
2826 }
2827
2828 /*
2829  * This routine discards an active speculation.  If the specified speculation
2830  * is not in a valid state to perform a discard(), this routine will silently
2831  * do nothing.  The state of the specified speculation is transitioned
2832  * according to the state transition diagram outlined in <sys/dtrace_impl.h>
2833  */
2834 static void
2835 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
2836     dtrace_specid_t which)
2837 {
2838         dtrace_speculation_t *spec;
2839         dtrace_speculation_state_t current, new = 0;
2840         dtrace_buffer_t *buf;
2841
2842         if (which == 0)
2843                 return;
2844
2845         if (which > state->dts_nspeculations) {
2846                 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2847                 return;
2848         }
2849
2850         spec = &state->dts_speculations[which - 1];
2851         buf = &spec->dtsp_buffer[cpu];
2852
2853         do {
2854                 current = spec->dtsp_state;
2855
2856                 switch (current) {
2857                 case DTRACESPEC_INACTIVE:
2858                 case DTRACESPEC_COMMITTINGMANY:
2859                 case DTRACESPEC_COMMITTING:
2860                 case DTRACESPEC_DISCARDING:
2861                         return;
2862
2863                 case DTRACESPEC_ACTIVE:
2864                 case DTRACESPEC_ACTIVEMANY:
2865                         new = DTRACESPEC_DISCARDING;
2866                         break;
2867
2868                 case DTRACESPEC_ACTIVEONE:
2869                         if (buf->dtb_offset != 0) {
2870                                 new = DTRACESPEC_INACTIVE;
2871                         } else {
2872                                 new = DTRACESPEC_DISCARDING;
2873                         }
2874                         break;
2875
2876                 default:
2877                         ASSERT(0);
2878                 }
2879         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2880             current, new) != current);
2881
2882         buf->dtb_offset = 0;
2883         buf->dtb_drops = 0;
2884 }
2885
2886 /*
2887  * Note:  not called from probe context.  This function is called
2888  * asynchronously from cross call context to clean any speculations that are
2889  * in the COMMITTINGMANY or DISCARDING states.  These speculations may not be
2890  * transitioned back to the INACTIVE state until all CPUs have cleaned the
2891  * speculation.
2892  */
2893 static void
2894 dtrace_speculation_clean_here(dtrace_state_t *state)
2895 {
2896         dtrace_icookie_t cookie;
2897         processorid_t cpu = curcpu;
2898         dtrace_buffer_t *dest = &state->dts_buffer[cpu];
2899         dtrace_specid_t i;
2900
2901         cookie = dtrace_interrupt_disable();
2902
2903         if (dest->dtb_tomax == NULL) {
2904                 dtrace_interrupt_enable(cookie);
2905                 return;
2906         }
2907
2908         for (i = 0; i < state->dts_nspeculations; i++) {
2909                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2910                 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
2911
2912                 if (src->dtb_tomax == NULL)
2913                         continue;
2914
2915                 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
2916                         src->dtb_offset = 0;
2917                         continue;
2918                 }
2919
2920                 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2921                         continue;
2922
2923                 if (src->dtb_offset == 0)
2924                         continue;
2925
2926                 dtrace_speculation_commit(state, cpu, i + 1);
2927         }
2928
2929         dtrace_interrupt_enable(cookie);
2930 }
2931
2932 /*
2933  * Note:  not called from probe context.  This function is called
2934  * asynchronously (and at a regular interval) to clean any speculations that
2935  * are in the COMMITTINGMANY or DISCARDING states.  If it discovers that there
2936  * is work to be done, it cross calls all CPUs to perform that work;
2937  * COMMITMANY and DISCARDING speculations may not be transitioned back to the
2938  * INACTIVE state until they have been cleaned by all CPUs.
2939  */
2940 static void
2941 dtrace_speculation_clean(dtrace_state_t *state)
2942 {
2943         int work = 0, rv;
2944         dtrace_specid_t i;
2945
2946         for (i = 0; i < state->dts_nspeculations; i++) {
2947                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2948
2949                 ASSERT(!spec->dtsp_cleaning);
2950
2951                 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
2952                     spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
2953                         continue;
2954
2955                 work++;
2956                 spec->dtsp_cleaning = 1;
2957         }
2958
2959         if (!work)
2960                 return;
2961
2962         dtrace_xcall(DTRACE_CPUALL,
2963             (dtrace_xcall_t)dtrace_speculation_clean_here, state);
2964
2965         /*
2966          * We now know that all CPUs have committed or discarded their
2967          * speculation buffers, as appropriate.  We can now set the state
2968          * to inactive.
2969          */
2970         for (i = 0; i < state->dts_nspeculations; i++) {
2971                 dtrace_speculation_t *spec = &state->dts_speculations[i];
2972                 dtrace_speculation_state_t current, new;
2973
2974                 if (!spec->dtsp_cleaning)
2975                         continue;
2976
2977                 current = spec->dtsp_state;
2978                 ASSERT(current == DTRACESPEC_DISCARDING ||
2979                     current == DTRACESPEC_COMMITTINGMANY);
2980
2981                 new = DTRACESPEC_INACTIVE;
2982
2983                 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
2984                 ASSERT(rv == current);
2985                 spec->dtsp_cleaning = 0;
2986         }
2987 }
2988
2989 /*
2990  * Called as part of a speculate() to get the speculative buffer associated
2991  * with a given speculation.  Returns NULL if the specified speculation is not
2992  * in an ACTIVE state.  If the speculation is in the ACTIVEONE state -- and
2993  * the active CPU is not the specified CPU -- the speculation will be
2994  * atomically transitioned into the ACTIVEMANY state.
2995  */
2996 static dtrace_buffer_t *
2997 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
2998     dtrace_specid_t which)
2999 {
3000         dtrace_speculation_t *spec;
3001         dtrace_speculation_state_t current, new = 0;
3002         dtrace_buffer_t *buf;
3003
3004         if (which == 0)
3005                 return (NULL);
3006
3007         if (which > state->dts_nspeculations) {
3008                 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3009                 return (NULL);
3010         }
3011
3012         spec = &state->dts_speculations[which - 1];
3013         buf = &spec->dtsp_buffer[cpuid];
3014
3015         do {
3016                 current = spec->dtsp_state;
3017
3018                 switch (current) {
3019                 case DTRACESPEC_INACTIVE:
3020                 case DTRACESPEC_COMMITTINGMANY:
3021                 case DTRACESPEC_DISCARDING:
3022                         return (NULL);
3023
3024                 case DTRACESPEC_COMMITTING:
3025                         ASSERT(buf->dtb_offset == 0);
3026                         return (NULL);
3027
3028                 case DTRACESPEC_ACTIVEONE:
3029                         /*
3030                          * This speculation is currently active on one CPU.
3031                          * Check the offset in the buffer; if it's non-zero,
3032                          * that CPU must be us (and we leave the state alone).
3033                          * If it's zero, assume that we're starting on a new
3034                          * CPU -- and change the state to indicate that the
3035                          * speculation is active on more than one CPU.
3036                          */
3037                         if (buf->dtb_offset != 0)
3038                                 return (buf);
3039
3040                         new = DTRACESPEC_ACTIVEMANY;
3041                         break;
3042
3043                 case DTRACESPEC_ACTIVEMANY:
3044                         return (buf);
3045
3046                 case DTRACESPEC_ACTIVE:
3047                         new = DTRACESPEC_ACTIVEONE;
3048                         break;
3049
3050                 default:
3051                         ASSERT(0);
3052                 }
3053         } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3054             current, new) != current);
3055
3056         ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3057         return (buf);
3058 }
3059
3060 /*
3061  * Return a string.  In the event that the user lacks the privilege to access
3062  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3063  * don't fail access checking.
3064  *
3065  * dtrace_dif_variable() uses this routine as a helper for various
3066  * builtin values such as 'execname' and 'probefunc.'
3067  */
3068 uintptr_t
3069 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3070     dtrace_mstate_t *mstate)
3071 {
3072         uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3073         uintptr_t ret;
3074         size_t strsz;
3075
3076         /*
3077          * The easy case: this probe is allowed to read all of memory, so
3078          * we can just return this as a vanilla pointer.
3079          */
3080         if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3081                 return (addr);
3082
3083         /*
3084          * This is the tougher case: we copy the string in question from
3085          * kernel memory into scratch memory and return it that way: this
3086          * ensures that we won't trip up when access checking tests the
3087          * BYREF return value.
3088          */
3089         strsz = dtrace_strlen((char *)addr, size) + 1;
3090
3091         if (mstate->dtms_scratch_ptr + strsz >
3092             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3093                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3094                 return (0);
3095         }
3096
3097         dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3098             strsz);
3099         ret = mstate->dtms_scratch_ptr;
3100         mstate->dtms_scratch_ptr += strsz;
3101         return (ret);
3102 }
3103
3104 /*
3105  * Return a string from a memoy address which is known to have one or
3106  * more concatenated, individually zero terminated, sub-strings.
3107  * In the event that the user lacks the privilege to access
3108  * arbitrary kernel memory, we copy the string out to scratch memory so that we
3109  * don't fail access checking.
3110  *
3111  * dtrace_dif_variable() uses this routine as a helper for various
3112  * builtin values such as 'execargs'.
3113  */
3114 static uintptr_t
3115 dtrace_dif_varstrz(uintptr_t addr, size_t strsz, dtrace_state_t *state,
3116     dtrace_mstate_t *mstate)
3117 {
3118         char *p;
3119         size_t i;
3120         uintptr_t ret;
3121
3122         if (mstate->dtms_scratch_ptr + strsz >
3123             mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3124                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3125                 return (0);
3126         }
3127
3128         dtrace_bcopy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3129             strsz);
3130
3131         /* Replace sub-string termination characters with a space. */
3132         for (p = (char *) mstate->dtms_scratch_ptr, i = 0; i < strsz - 1;
3133             p++, i++)
3134                 if (*p == '\0')
3135                         *p = ' ';
3136
3137         ret = mstate->dtms_scratch_ptr;
3138         mstate->dtms_scratch_ptr += strsz;
3139         return (ret);
3140 }
3141
3142 /*
3143  * This function implements the DIF emulator's variable lookups.  The emulator
3144  * passes a reserved variable identifier and optional built-in array index.
3145  */
3146 static uint64_t
3147 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3148     uint64_t ndx)
3149 {
3150         /*
3151          * If we're accessing one of the uncached arguments, we'll turn this
3152          * into a reference in the args array.
3153          */
3154         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3155                 ndx = v - DIF_VAR_ARG0;
3156                 v = DIF_VAR_ARGS;
3157         }
3158
3159         switch (v) {
3160         case DIF_VAR_ARGS:
3161                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3162                 if (ndx >= sizeof (mstate->dtms_arg) /
3163                     sizeof (mstate->dtms_arg[0])) {
3164                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3165                         dtrace_provider_t *pv;
3166                         uint64_t val;
3167
3168                         pv = mstate->dtms_probe->dtpr_provider;
3169                         if (pv->dtpv_pops.dtps_getargval != NULL)
3170                                 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3171                                     mstate->dtms_probe->dtpr_id,
3172                                     mstate->dtms_probe->dtpr_arg, ndx, aframes);
3173                         else
3174                                 val = dtrace_getarg(ndx, aframes);
3175
3176                         /*
3177                          * This is regrettably required to keep the compiler
3178                          * from tail-optimizing the call to dtrace_getarg().
3179                          * The condition always evaluates to true, but the
3180                          * compiler has no way of figuring that out a priori.
3181                          * (None of this would be necessary if the compiler
3182                          * could be relied upon to _always_ tail-optimize
3183                          * the call to dtrace_getarg() -- but it can't.)
3184                          */
3185                         if (mstate->dtms_probe != NULL)
3186                                 return (val);
3187
3188                         ASSERT(0);
3189                 }
3190
3191                 return (mstate->dtms_arg[ndx]);
3192
3193 #if defined(sun)
3194         case DIF_VAR_UREGS: {
3195                 klwp_t *lwp;
3196
3197                 if (!dtrace_priv_proc(state))
3198                         return (0);
3199
3200                 if ((lwp = curthread->t_lwp) == NULL) {
3201                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3202                         cpu_core[curcpu].cpuc_dtrace_illval = NULL;
3203                         return (0);
3204                 }
3205
3206                 return (dtrace_getreg(lwp->lwp_regs, ndx));
3207                 return (0);
3208         }
3209 #else
3210         case DIF_VAR_UREGS: {
3211                 struct trapframe *tframe;
3212
3213                 if (!dtrace_priv_proc(state))
3214                         return (0);
3215
3216                 if ((tframe = curthread->td_frame) == NULL) {
3217                         DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3218                         cpu_core[curcpu].cpuc_dtrace_illval = 0;
3219                         return (0);
3220                 }
3221
3222                 return (dtrace_getreg(tframe, ndx));
3223         }
3224 #endif
3225
3226         case DIF_VAR_CURTHREAD:
3227                 if (!dtrace_priv_proc(state))
3228                         return (0);
3229                 return ((uint64_t)(uintptr_t)curthread);
3230
3231         case DIF_VAR_TIMESTAMP:
3232                 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3233                         mstate->dtms_timestamp = dtrace_gethrtime();
3234                         mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3235                 }
3236                 return (mstate->dtms_timestamp);
3237
3238         case DIF_VAR_VTIMESTAMP:
3239                 ASSERT(dtrace_vtime_references != 0);
3240                 return (curthread->t_dtrace_vtime);
3241
3242         case DIF_VAR_WALLTIMESTAMP:
3243                 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3244                         mstate->dtms_walltimestamp = dtrace_gethrestime();
3245                         mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3246                 }
3247                 return (mstate->dtms_walltimestamp);
3248
3249 #if defined(sun)
3250         case DIF_VAR_IPL:
3251                 if (!dtrace_priv_kernel(state))
3252                         return (0);
3253                 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3254                         mstate->dtms_ipl = dtrace_getipl();
3255                         mstate->dtms_present |= DTRACE_MSTATE_IPL;
3256                 }
3257                 return (mstate->dtms_ipl);
3258 #endif
3259
3260         case DIF_VAR_EPID:
3261                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3262                 return (mstate->dtms_epid);
3263
3264         case DIF_VAR_ID:
3265                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3266                 return (mstate->dtms_probe->dtpr_id);
3267
3268         case DIF_VAR_STACKDEPTH:
3269                 if (!dtrace_priv_kernel(state))
3270                         return (0);
3271                 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3272                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3273
3274                         mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3275                         mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3276                 }
3277                 return (mstate->dtms_stackdepth);
3278
3279         case DIF_VAR_USTACKDEPTH:
3280                 if (!dtrace_priv_proc(state))
3281                         return (0);
3282                 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3283                         /*
3284                          * See comment in DIF_VAR_PID.
3285                          */
3286                         if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3287                             CPU_ON_INTR(CPU)) {
3288                                 mstate->dtms_ustackdepth = 0;
3289                         } else {
3290                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3291                                 mstate->dtms_ustackdepth =
3292                                     dtrace_getustackdepth();
3293                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3294                         }
3295                         mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3296                 }
3297                 return (mstate->dtms_ustackdepth);
3298
3299         case DIF_VAR_CALLER:
3300                 if (!dtrace_priv_kernel(state))
3301                         return (0);
3302                 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3303                         int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3304
3305                         if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3306                                 /*
3307                                  * If this is an unanchored probe, we are
3308                                  * required to go through the slow path:
3309                                  * dtrace_caller() only guarantees correct
3310                                  * results for anchored probes.
3311                                  */
3312                                 pc_t caller[2] = {0, 0};
3313
3314                                 dtrace_getpcstack(caller, 2, aframes,
3315                                     (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3316                                 mstate->dtms_caller = caller[1];
3317                         } else if ((mstate->dtms_caller =
3318                             dtrace_caller(aframes)) == -1) {
3319                                 /*
3320                                  * We have failed to do this the quick way;
3321                                  * we must resort to the slower approach of
3322                                  * calling dtrace_getpcstack().
3323                                  */
3324                                 pc_t caller = 0;
3325
3326                                 dtrace_getpcstack(&caller, 1, aframes, NULL);
3327                                 mstate->dtms_caller = caller;
3328                         }
3329
3330                         mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3331                 }
3332                 return (mstate->dtms_caller);
3333
3334         case DIF_VAR_UCALLER:
3335                 if (!dtrace_priv_proc(state))
3336                         return (0);
3337
3338                 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3339                         uint64_t ustack[3];
3340
3341                         /*
3342                          * dtrace_getupcstack() fills in the first uint64_t
3343                          * with the current PID.  The second uint64_t will
3344                          * be the program counter at user-level.  The third
3345                          * uint64_t will contain the caller, which is what
3346                          * we're after.
3347                          */
3348                         ustack[2] = 0;
3349                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3350                         dtrace_getupcstack(ustack, 3);
3351                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3352                         mstate->dtms_ucaller = ustack[2];
3353                         mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3354                 }
3355
3356                 return (mstate->dtms_ucaller);
3357
3358         case DIF_VAR_PROBEPROV:
3359                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3360                 return (dtrace_dif_varstr(
3361                     (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3362                     state, mstate));
3363
3364         case DIF_VAR_PROBEMOD:
3365                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3366                 return (dtrace_dif_varstr(
3367                     (uintptr_t)mstate->dtms_probe->dtpr_mod,
3368                     state, mstate));
3369
3370         case DIF_VAR_PROBEFUNC:
3371                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3372                 return (dtrace_dif_varstr(
3373                     (uintptr_t)mstate->dtms_probe->dtpr_func,
3374                     state, mstate));
3375
3376         case DIF_VAR_PROBENAME:
3377                 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3378                 return (dtrace_dif_varstr(
3379                     (uintptr_t)mstate->dtms_probe->dtpr_name,
3380                     state, mstate));
3381
3382         case DIF_VAR_PID:
3383                 if (!dtrace_priv_proc(state))
3384                         return (0);
3385
3386 #if defined(sun)
3387                 /*
3388                  * Note that we are assuming that an unanchored probe is
3389                  * always due to a high-level interrupt.  (And we're assuming
3390                  * that there is only a single high level interrupt.)
3391                  */
3392                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3393                         return (pid0.pid_id);
3394
3395                 /*
3396                  * It is always safe to dereference one's own t_procp pointer:
3397                  * it always points to a valid, allocated proc structure.
3398                  * Further, it is always safe to dereference the p_pidp member
3399                  * of one's own proc structure.  (These are truisms becuase
3400                  * threads and processes don't clean up their own state --
3401                  * they leave that task to whomever reaps them.)
3402                  */
3403                 return ((uint64_t)curthread->t_procp->p_pidp->pid_id);
3404 #else
3405                 return ((uint64_t)curproc->p_pid);
3406 #endif
3407
3408         case DIF_VAR_PPID:
3409                 if (!dtrace_priv_proc(state))
3410                         return (0);
3411
3412 #if defined(sun)
3413                 /*
3414                  * See comment in DIF_VAR_PID.
3415                  */
3416                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3417                         return (pid0.pid_id);
3418
3419                 /*
3420                  * It is always safe to dereference one's own t_procp pointer:
3421                  * it always points to a valid, allocated proc structure.
3422                  * (This is true because threads don't clean up their own
3423                  * state -- they leave that task to whomever reaps them.)
3424                  */
3425                 return ((uint64_t)curthread->t_procp->p_ppid);
3426 #else
3427                 if (curproc->p_pid == proc0.p_pid)
3428                         return (curproc->p_pid);
3429                 else
3430                         return (curproc->p_pptr->p_pid);
3431 #endif
3432
3433         case DIF_VAR_TID:
3434 #if defined(sun)
3435                 /*
3436                  * See comment in DIF_VAR_PID.
3437                  */
3438                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3439                         return (0);
3440 #endif
3441
3442                 return ((uint64_t)curthread->t_tid);
3443
3444         case DIF_VAR_EXECARGS: {
3445                 struct pargs *p_args = curthread->td_proc->p_args;
3446
3447                 if (p_args == NULL)
3448                         return(0);
3449
3450                 return (dtrace_dif_varstrz(
3451                     (uintptr_t) p_args->ar_args, p_args->ar_length, state, mstate));
3452         }
3453
3454         case DIF_VAR_EXECNAME:
3455 #if defined(sun)
3456                 if (!dtrace_priv_proc(state))
3457                         return (0);
3458
3459                 /*
3460                  * See comment in DIF_VAR_PID.
3461                  */
3462                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3463                         return ((uint64_t)(uintptr_t)p0.p_user.u_comm);
3464
3465                 /*
3466                  * It is always safe to dereference one's own t_procp pointer:
3467                  * it always points to a valid, allocated proc structure.
3468                  * (This is true because threads don't clean up their own
3469                  * state -- they leave that task to whomever reaps them.)
3470                  */
3471                 return (dtrace_dif_varstr(
3472                     (uintptr_t)curthread->t_procp->p_user.u_comm,
3473                     state, mstate));
3474 #else
3475                 return (dtrace_dif_varstr(
3476                     (uintptr_t) curthread->td_proc->p_comm, state, mstate));
3477 #endif
3478
3479         case DIF_VAR_ZONENAME:
3480 #if defined(sun)
3481                 if (!dtrace_priv_proc(state))
3482                         return (0);
3483
3484                 /*
3485                  * See comment in DIF_VAR_PID.
3486                  */
3487                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3488                         return ((uint64_t)(uintptr_t)p0.p_zone->zone_name);
3489
3490                 /*
3491                  * It is always safe to dereference one's own t_procp pointer:
3492                  * it always points to a valid, allocated proc structure.
3493                  * (This is true because threads don't clean up their own
3494                  * state -- they leave that task to whomever reaps them.)
3495                  */
3496                 return (dtrace_dif_varstr(
3497                     (uintptr_t)curthread->t_procp->p_zone->zone_name,
3498                     state, mstate));
3499 #else
3500                 return (0);
3501 #endif
3502
3503         case DIF_VAR_UID:
3504                 if (!dtrace_priv_proc(state))
3505                         return (0);
3506
3507 #if defined(sun)
3508                 /*
3509                  * See comment in DIF_VAR_PID.
3510                  */
3511                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3512                         return ((uint64_t)p0.p_cred->cr_uid);
3513
3514                 /*
3515                  * It is always safe to dereference one's own t_procp pointer:
3516                  * it always points to a valid, allocated proc structure.
3517                  * (This is true because threads don't clean up their own
3518                  * state -- they leave that task to whomever reaps them.)
3519                  *
3520                  * Additionally, it is safe to dereference one's own process
3521                  * credential, since this is never NULL after process birth.
3522                  */
3523                 return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
3524 #else
3525                 return ((uint64_t)curthread->td_ucred->cr_uid);
3526 #endif
3527
3528         case DIF_VAR_GID:
3529                 if (!dtrace_priv_proc(state))
3530                         return (0);
3531
3532 #if defined(sun)
3533                 /*
3534                  * See comment in DIF_VAR_PID.
3535                  */
3536                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3537                         return ((uint64_t)p0.p_cred->cr_gid);
3538
3539                 /*
3540                  * It is always safe to dereference one's own t_procp pointer:
3541                  * it always points to a valid, allocated proc structure.
3542                  * (This is true because threads don't clean up their own
3543                  * state -- they leave that task to whomever reaps them.)
3544                  *
3545                  * Additionally, it is safe to dereference one's own process
3546                  * credential, since this is never NULL after process birth.
3547                  */
3548                 return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
3549 #else
3550                 return ((uint64_t)curthread->td_ucred->cr_gid);
3551 #endif
3552
3553         case DIF_VAR_ERRNO: {
3554 #if defined(sun)
3555                 klwp_t *lwp;
3556                 if (!dtrace_priv_proc(state))
3557                         return (0);
3558
3559                 /*
3560                  * See comment in DIF_VAR_PID.
3561                  */
3562                 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3563                         return (0);
3564
3565                 /*
3566                  * It is always safe to dereference one's own t_lwp pointer in
3567                  * the event that this pointer is non-NULL.  (This is true
3568                  * because threads and lwps don't clean up their own state --
3569                  * they leave that task to whomever reaps them.)
3570                  */
3571                 if ((lwp = curthread->t_lwp) == NULL)
3572                         return (0);
3573
3574                 return ((uint64_t)lwp->lwp_errno);
3575 #else
3576                 return (curthread->td_errno);
3577 #endif
3578         }
3579 #if !defined(sun)
3580         case DIF_VAR_CPU: {
3581                 return curcpu;
3582         }
3583 #endif
3584         default:
3585                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3586                 return (0);
3587         }
3588 }
3589
3590
/*
 * States of the dtrace_json() character-at-a-time state machine.  Numbering
 * starts at 1 and is spelled out explicitly for each state.
 */
typedef enum dtrace_json_state {
        /* Skipping whitespace before the top-level Object or Array. */
        DTRACE_JSON_REST = 1,
        /* Looking for the next key String inside an Object. */
        DTRACE_JSON_OBJECT = 2,
        /* Inside a String literal (key or value). */
        DTRACE_JSON_STRING = 3,
        /* Just saw a backslash inside a String. */
        DTRACE_JSON_STRING_ESCAPE = 4,
        /* Consuming the four hex digits of a \u escape. */
        DTRACE_JSON_STRING_ESCAPE_UNICODE = 5,
        /* Expecting the colon between a key and its value. */
        DTRACE_JSON_COLON = 6,
        /* Expecting the comma between key-value pairs or Array values. */
        DTRACE_JSON_COMMA = 7,
        /* Determining the type of the next value. */
        DTRACE_JSON_VALUE = 8,
        /* Inside a "true", "false" or "null" literal. */
        DTRACE_JSON_IDENTIFIER = 9,
        /* Inside a Number literal. */
        DTRACE_JSON_NUMBER = 10,
        /* Number literal state; presumably the fractional part. */
        DTRACE_JSON_NUMBER_FRAC = 11,
        /* Number literal state; presumably the exponent part. */
        DTRACE_JSON_NUMBER_EXP = 12,
        /* Collecting (or skipping over) an entire nested Object or Array. */
        DTRACE_JSON_COLLECT_OBJECT = 13
} dtrace_json_state_t;
3606
3607 /*
3608  * This function possesses just enough knowledge about JSON to extract a single
3609  * value from a JSON string and store it in the scratch buffer.  It is able
3610  * to extract nested object values, and members of arrays by index.
3611  *
3612  * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3613  * be looked up as we descend into the object tree.  e.g.
3614  *
3615  *    foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3616  *       with nelems = 5.
3617  *
3618  * The run time of this function must be bounded above by strsize to limit the
3619  * amount of work done in probe context.  As such, it is implemented as a
3620  * simple state machine, reading one character at a time using safe loads
3621  * until we find the requested element, hit a parsing error or run off the
3622  * end of the object or string.
3623  *
3624  * As there is no way for a subroutine to return an error without interrupting
3625  * clause execution, we simply return NULL in the event of a missing key or any
3626  * other error condition.  Each NULL return in this function is commented with
3627  * the error condition it represents -- parsing or otherwise.
3628  *
3629  * The set of states for the state machine closely matches the JSON
3630  * specification (http://json.org/).  Briefly:
3631  *
3632  *   DTRACE_JSON_REST:
3633  *     Skip whitespace until we find either a top-level Object, moving
3634  *     to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3635  *
3636  *   DTRACE_JSON_OBJECT:
3637  *     Locate the next key String in an Object.  Sets a flag to denote
3638  *     the next String as a key string and moves to DTRACE_JSON_STRING.
3639  *
3640  *   DTRACE_JSON_COLON:
3641  *     Skip whitespace until we find the colon that separates key Strings
3642  *     from their values.  Once found, move to DTRACE_JSON_VALUE.
3643  *
3644  *   DTRACE_JSON_VALUE:
3645  *     Detects the type of the next value (String, Number, Identifier, Object
3646  *     or Array) and routes to the states that process that type.  Here we also
3647  *     deal with the element selector list if we are requested to traverse down
3648  *     into the object tree.
3649  *
3650  *   DTRACE_JSON_COMMA:
3651  *     Skip whitespace until we find the comma that separates key-value pairs
3652  *     in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3653  *     (similarly DTRACE_JSON_VALUE).  All following literal value processing
3654  *     states return to this state at the end of their value, unless otherwise
3655  *     noted.
3656  *
3657  *   DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3658  *     Processes a Number literal from the JSON, including any exponent
3659  *     component that may be present.  Numbers are returned as strings, which
3660  *     may be passed to strtoll() if an integer is required.
3661  *
3662  *   DTRACE_JSON_IDENTIFIER:
3663  *     Processes a "true", "false" or "null" literal in the JSON.
3664  *
3665  *   DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3666  *   DTRACE_JSON_STRING_ESCAPE_UNICODE:
3667  *     Processes a String literal from the JSON, whether the String denotes
3668  *     a key, a value or part of a larger Object.  Handles all escape sequences
3669  *     present in the specification, including four-digit unicode characters,
3670  *     but merely includes the escape sequence without converting it to the
3671  *     actual escaped character.  If the String is flagged as a key, we
3672  *     move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3673  *
3674  *   DTRACE_JSON_COLLECT_OBJECT:
3675  *     This state collects an entire Object (or Array), correctly handling
3676  *     embedded strings.  If the full element selector list matches this nested
3677  *     object, we return the Object in full as a string.  If not, we use this
3678  *     state to skip to the next value at this level and continue processing.
3679  *
3680  * NOTE: This function uses various macros from strtolctype.h to manipulate
3681  * digit values, etc -- these have all been checked to ensure they make
3682  * no additional function calls.
3683  */
3684 static char *
3685 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3686     char *dest)
3687 {
3688         dtrace_json_state_t state = DTRACE_JSON_REST;
3689         int64_t array_elem = INT64_MIN;
3690         int64_t array_pos = 0;
3691         uint8_t escape_unicount = 0;
3692         boolean_t string_is_key = B_FALSE;
3693         boolean_t collect_object = B_FALSE;
3694         boolean_t found_key = B_FALSE;
3695         boolean_t in_array = B_FALSE;
3696         uint32_t braces = 0, brackets = 0;
3697         char *elem = elemlist;
3698         char *dd = dest;
3699         uintptr_t cur;
3700
3701         for (cur = json; cur < json + size; cur++) {
3702                 char cc = dtrace_load8(cur);
3703                 if (cc == '\0')
3704                         return (NULL);
3705
3706                 switch (state) {
3707                 case DTRACE_JSON_REST:
3708                         if (isspace(cc))
3709                                 break;
3710
3711                         if (cc == '{') {
3712                                 state = DTRACE_JSON_OBJECT;
3713                                 break;
3714                         }
3715
3716                         if (cc == '[') {
3717                                 in_array = B_TRUE;
3718                                 array_pos = 0;
3719                                 array_elem = dtrace_strtoll(elem, 10, size);
3720                                 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3721                                 state = DTRACE_JSON_VALUE;
3722                                 break;
3723                         }
3724
3725                         /*
3726                          * ERROR: expected to find a top-level object or array.
3727                          */
3728                         return (NULL);
3729                 case DTRACE_JSON_OBJECT:
3730                         if (isspace(cc))
3731                                 break;
3732
3733                         if (cc == '"') {
3734                                 state = DTRACE_JSON_STRING;
3735                                 string_is_key = B_TRUE;
3736                                 break;
3737                         }
3738
3739                         /*
3740                          * ERROR: either the object did not start with a key
3741                          * string, or we've run off the end of the object
3742                          * without finding the requested key.
3743                          */
3744                         return (NULL);
3745                 case DTRACE_JSON_STRING:
3746                         if (cc == '\\') {
3747                                 *dd++ = '\\';
3748                                 state = DTRACE_JSON_STRING_ESCAPE;
3749                                 break;
3750                         }
3751
3752                         if (cc == '"') {
3753                                 if (collect_object) {
3754                                         /*
3755                                          * We don't reset the dest here, as
3756                                          * the string is part of a larger
3757                                          * object being collected.
3758                                          */
3759                                         *dd++ = cc;
3760                                         collect_object = B_FALSE;
3761                                         state = DTRACE_JSON_COLLECT_OBJECT;
3762                                         break;
3763                                 }
3764                                 *dd = '\0';
3765                                 dd = dest; /* reset string buffer */
3766                                 if (string_is_key) {
3767                                         if (dtrace_strncmp(dest, elem,
3768                                             size) == 0)
3769                                                 found_key = B_TRUE;
3770                                 } else if (found_key) {
3771                                         if (nelems > 1) {
3772                                                 /*
3773                                                  * We expected an object, not
3774                                                  * this string.
3775                                                  */
3776                                                 return (NULL);
3777                                         }
3778                                         return (dest);
3779                                 }
3780                                 state = string_is_key ? DTRACE_JSON_COLON :
3781                                     DTRACE_JSON_COMMA;
3782                                 string_is_key = B_FALSE;
3783                                 break;
3784                         }
3785
3786                         *dd++ = cc;
3787                         break;
3788                 case DTRACE_JSON_STRING_ESCAPE:
3789                         *dd++ = cc;
3790                         if (cc == 'u') {
3791                                 escape_unicount = 0;
3792                                 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3793                         } else {
3794                                 state = DTRACE_JSON_STRING;
3795                         }
3796                         break;
3797                 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3798                         if (!isxdigit(cc)) {
3799                                 /*
3800                                  * ERROR: invalid unicode escape, expected
3801                                  * four valid hexadecimal digits.
3802                                  */
3803                                 return (NULL);
3804                         }
3805
3806                         *dd++ = cc;
3807                         if (++escape_unicount == 4)
3808                                 state = DTRACE_JSON_STRING;
3809                         break;
3810                 case DTRACE_JSON_COLON:
3811                         if (isspace(cc))
3812                                 break;
3813
3814                         if (cc == ':') {
3815                                 state = DTRACE_JSON_VALUE;
3816                                 break;
3817                         }
3818
3819                         /*
3820                          * ERROR: expected a colon.
3821                          */
3822                         return (NULL);
3823                 case DTRACE_JSON_COMMA:
3824                         if (isspace(cc))
3825                                 break;
3826
3827                         if (cc == ',') {
3828                                 if (in_array) {
3829                                         state = DTRACE_JSON_VALUE;
3830                                         if (++array_pos == array_elem)
3831                                                 found_key = B_TRUE;
3832                                 } else {
3833                                         state = DTRACE_JSON_OBJECT;
3834                                 }
3835                                 break;
3836                         }
3837
3838                         /*
3839                          * ERROR: either we hit an unexpected character, or
3840                          * we reached the end of the object or array without
3841                          * finding the requested key.
3842                          */
3843                         return (NULL);
3844                 case DTRACE_JSON_IDENTIFIER:
3845                         if (islower(cc)) {
3846                                 *dd++ = cc;
3847                                 break;
3848                         }
3849
3850                         *dd = '\0';
3851                         dd = dest; /* reset string buffer */
3852
3853                         if (dtrace_strncmp(dest, "true", 5) == 0 ||
3854                             dtrace_strncmp(dest, "false", 6) == 0 ||
3855                             dtrace_strncmp(dest, "null", 5) == 0) {
3856                                 if (found_key) {
3857                                         if (nelems > 1) {
3858                                                 /*
3859                                                  * ERROR: We expected an object,
3860                                                  * not this identifier.
3861                                                  */
3862                                                 return (NULL);
3863                                         }
3864                                         return (dest);
3865                                 } else {
3866                                         cur--;
3867                                         state = DTRACE_JSON_COMMA;
3868                                         break;
3869                                 }
3870                         }
3871
3872                         /*
3873                          * ERROR: we did not recognise the identifier as one
3874                          * of those in the JSON specification.
3875                          */
3876                         return (NULL);
3877                 case DTRACE_JSON_NUMBER:
3878                         if (cc == '.') {
3879                                 *dd++ = cc;
3880                                 state = DTRACE_JSON_NUMBER_FRAC;
3881                                 break;
3882                         }
3883
3884                         if (cc == 'x' || cc == 'X') {
3885                                 /*
3886                                  * ERROR: specification explicitly excludes
3887                                  * hexadecimal or octal numbers.
3888                                  */
3889                                 return (NULL);
3890                         }
3891
3892                         /* FALLTHRU */
3893                 case DTRACE_JSON_NUMBER_FRAC:
3894                         if (cc == 'e' || cc == 'E') {
3895                                 *dd++ = cc;
3896                                 state = DTRACE_JSON_NUMBER_EXP;
3897                                 break;
3898                         }
3899
3900                         if (cc == '+' || cc == '-') {
3901                                 /*
3902                                  * ERROR: expect sign as part of exponent only.
3903                                  */
3904                                 return (NULL);
3905                         }
3906                         /* FALLTHRU */
3907                 case DTRACE_JSON_NUMBER_EXP:
3908                         if (isdigit(cc) || cc == '+' || cc == '-') {
3909                                 *dd++ = cc;
3910                                 break;
3911                         }
3912
3913                         *dd = '\0';
3914                         dd = dest; /* reset string buffer */
3915                         if (found_key) {
3916                                 if (nelems > 1) {
3917                                         /*
3918                                          * ERROR: We expected an object, not
3919                                          * this number.
3920                                          */
3921                                         return (NULL);
3922                                 }
3923                                 return (dest);
3924                         }
3925
3926                         cur--;
3927                         state = DTRACE_JSON_COMMA;
3928                         break;
3929                 case DTRACE_JSON_VALUE:
3930                         if (isspace(cc))
3931                                 break;
3932
3933                         if (cc == '{' || cc == '[') {
3934                                 if (nelems > 1 && found_key) {
3935                                         in_array = cc == '[' ? B_TRUE : B_FALSE;
3936                                         /*
3937                                          * If our element selector directs us
3938                                          * to descend into this nested object,
3939                                          * then move to the next selector
3940                                          * element in the list and restart the
3941                                          * state machine.
3942                                          */
3943                                         while (*elem != '\0')
3944                                                 elem++;
3945                                         elem++; /* skip the inter-element NUL */
3946                                         nelems--;
3947                                         dd = dest;
3948                                         if (in_array) {
3949                                                 state = DTRACE_JSON_VALUE;
3950                                                 array_pos = 0;
3951                                                 array_elem = dtrace_strtoll(
3952                                                     elem, 10, size);
3953                                                 found_key = array_elem == 0 ?
3954                                                     B_TRUE : B_FALSE;
3955                                         } else {
3956                                                 found_key = B_FALSE;
3957                                                 state = DTRACE_JSON_OBJECT;
3958                                         }
3959                                         break;
3960                                 }
3961
3962                                 /*
3963                                  * Otherwise, we wish to either skip this
3964                                  * nested object or return it in full.
3965                                  */
3966                                 if (cc == '[')
3967                                         brackets = 1;
3968                                 else
3969                                         braces = 1;
3970                                 *dd++ = cc;
3971                                 state = DTRACE_JSON_COLLECT_OBJECT;
3972                                 break;
3973                         }
3974
3975                         if (cc == '"') {
3976                                 state = DTRACE_JSON_STRING;
3977                                 break;
3978                         }
3979
3980                         if (islower(cc)) {
3981                                 /*
3982                                  * Here we deal with true, false and null.
3983                                  */
3984                                 *dd++ = cc;
3985                                 state = DTRACE_JSON_IDENTIFIER;
3986                                 break;
3987                         }
3988
3989                         if (cc == '-' || isdigit(cc)) {
3990                                 *dd++ = cc;
3991                                 state = DTRACE_JSON_NUMBER;
3992                                 break;
3993                         }
3994
3995                         /*
3996                          * ERROR: unexpected character at start of value.
3997                          */
3998                         return (NULL);
3999                 case DTRACE_JSON_COLLECT_OBJECT:
4000                         if (cc == '\0')
4001                                 /*
4002                                  * ERROR: unexpected end of input.
4003                                  */
4004                                 return (NULL);
4005
4006                         *dd++ = cc;
4007                         if (cc == '"') {
4008                                 collect_object = B_TRUE;
4009                                 state = DTRACE_JSON_STRING;
4010                                 break;
4011                         }
4012
4013                         if (cc == ']') {
4014                                 if (brackets-- == 0) {
4015                                         /*
4016                                          * ERROR: unbalanced brackets.
4017                                          */
4018                                         return (NULL);
4019                                 }
4020                         } else if (cc == '}') {
4021                                 if (braces-- == 0) {
4022                                         /*
4023                                          * ERROR: unbalanced braces.
4024                                          */
4025                                         return (NULL);
4026                                 }
4027                         } else if (cc == '{') {
4028                                 braces++;
4029                         } else if (cc == '[') {
4030                                 brackets++;
4031                         }
4032
4033                         if (brackets == 0 && braces == 0) {
4034                                 if (found_key) {
4035                                         *dd = '\0';
4036                                         return (dest);
4037                                 }
4038                                 dd = dest; /* reset string buffer */
4039                                 state = DTRACE_JSON_COMMA;
4040                         }
4041                         break;
4042                 }
4043         }
4044         return (NULL);
4045 }
4046
4047 /*
4048  * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4049  * Notice that we don't bother validating the proper number of arguments or
4050  * their types in the tuple stack.  This isn't needed: all argument
4051  * interpretation is safe thanks to our load safety -- the worst that can
4052  * happen is that a bogus program can obtain bogus results.
4053  */
4054 static void
4055 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4056     dtrace_key_t *tupregs, int nargs,
4057     dtrace_mstate_t *mstate, dtrace_state_t *state)
4058 {
4059         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
4060         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
4061         dtrace_vstate_t *vstate = &state->dts_vstate;
4062
4063 #if defined(sun)
4064         union {
4065                 mutex_impl_t mi;
4066                 uint64_t mx;
4067         } m;
4068
4069         union {
4070                 krwlock_t ri;
4071                 uintptr_t rw;
4072         } r;
4073 #else
4074         struct thread *lowner;
4075         union {
4076                 struct lock_object *li;
4077                 uintptr_t lx;
4078         } l;
4079 #endif
4080
4081         switch (subr) {
4082         case DIF_SUBR_RAND:
4083                 regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
4084                 break;
4085
4086 #if defined(sun)
4087         case DIF_SUBR_MUTEX_OWNED:
4088                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4089                     mstate, vstate)) {
4090                         regs[rd] = 0;
4091                         break;
4092                 }
4093
4094                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4095                 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4096                         regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4097                 else
4098                         regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4099                 break;
4100
4101         case DIF_SUBR_MUTEX_OWNER:
4102                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4103                     mstate, vstate)) {
4104                         regs[rd] = 0;
4105                         break;
4106                 }
4107
4108                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4109                 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4110                     MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4111                         regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4112                 else
4113                         regs[rd] = 0;
4114                 break;
4115
4116         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4117                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4118                     mstate, vstate)) {
4119                         regs[rd] = 0;
4120                         break;
4121                 }
4122
4123                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4124                 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4125                 break;
4126
4127         case DIF_SUBR_MUTEX_TYPE_SPIN:
4128                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4129                     mstate, vstate)) {
4130                         regs[rd] = 0;
4131                         break;
4132                 }
4133
4134                 m.mx = dtrace_load64(tupregs[0].dttk_value);
4135                 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4136                 break;
4137
4138         case DIF_SUBR_RW_READ_HELD: {
4139                 uintptr_t tmp;
4140
4141                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4142                     mstate, vstate)) {
4143                         regs[rd] = 0;
4144                         break;
4145                 }
4146
4147                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4148                 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4149                 break;
4150         }
4151
4152         case DIF_SUBR_RW_WRITE_HELD:
4153                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4154                     mstate, vstate)) {
4155                         regs[rd] = 0;
4156                         break;
4157                 }
4158
4159                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4160                 regs[rd] = _RW_WRITE_HELD(&r.ri);
4161                 break;
4162
4163         case DIF_SUBR_RW_ISWRITER:
4164                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4165                     mstate, vstate)) {
4166                         regs[rd] = 0;
4167                         break;
4168                 }
4169
4170                 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4171                 regs[rd] = _RW_ISWRITER(&r.ri);
4172                 break;
4173
4174 #else
4175         case DIF_SUBR_MUTEX_OWNED:
4176                 if (!dtrace_canload(tupregs[0].dttk_value,
4177                         sizeof (struct lock_object), mstate, vstate)) {
4178                         regs[rd] = 0;
4179                         break;
4180                 }
4181                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4182                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4183                 break;
4184
4185         case DIF_SUBR_MUTEX_OWNER:
4186                 if (!dtrace_canload(tupregs[0].dttk_value,
4187                         sizeof (struct lock_object), mstate, vstate)) {
4188                         regs[rd] = 0;
4189                         break;
4190                 }
4191                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4192                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4193                 regs[rd] = (uintptr_t)lowner;
4194                 break;
4195
4196         case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4197                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4198                     mstate, vstate)) {
4199                         regs[rd] = 0;
4200                         break;
4201                 }
4202                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4203                 /* XXX - should be only LC_SLEEPABLE? */
4204                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags &
4205                     (LC_SLEEPLOCK | LC_SLEEPABLE)) != 0;
4206                 break;
4207
4208         case DIF_SUBR_MUTEX_TYPE_SPIN:
4209                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (struct mtx),
4210                     mstate, vstate)) {
4211                         regs[rd] = 0;
4212                         break;
4213                 }
4214                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4215                 regs[rd] = (LOCK_CLASS(l.li)->lc_flags & LC_SPINLOCK) != 0;
4216                 break;
4217
4218         case DIF_SUBR_RW_READ_HELD: 
4219         case DIF_SUBR_SX_SHARED_HELD: 
4220                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4221                     mstate, vstate)) {
4222                         regs[rd] = 0;
4223                         break;
4224                 }
4225                 l.lx = dtrace_loadptr((uintptr_t)&tupregs[0].dttk_value);
4226                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4227                     lowner == NULL;
4228                 break;
4229
4230         case DIF_SUBR_RW_WRITE_HELD:
4231         case DIF_SUBR_SX_EXCLUSIVE_HELD:
4232                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4233                     mstate, vstate)) {
4234                         regs[rd] = 0;
4235                         break;
4236                 }
4237                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4238                 LOCK_CLASS(l.li)->lc_owner(l.li, &lowner);
4239                 regs[rd] = (lowner == curthread);
4240                 break;
4241
4242         case DIF_SUBR_RW_ISWRITER:
4243         case DIF_SUBR_SX_ISEXCLUSIVE:
4244                 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4245                     mstate, vstate)) {
4246                         regs[rd] = 0;
4247                         break;
4248                 }
4249                 l.lx = dtrace_loadptr(tupregs[0].dttk_value);
4250                 regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
4251                     lowner != NULL;
4252                 break;
4253 #endif /* ! defined(sun) */
4254
4255         case DIF_SUBR_BCOPY: {
4256                 /*
4257                  * We need to be sure that the destination is in the scratch
4258                  * region -- no other region is allowed.
4259                  */
4260                 uintptr_t src = tupregs[0].dttk_value;
4261                 uintptr_t dest = tupregs[1].dttk_value;
4262                 size_t size = tupregs[2].dttk_value;
4263
4264                 if (!dtrace_inscratch(dest, size, mstate)) {
4265                         *flags |= CPU_DTRACE_BADADDR;
4266                         *illval = regs[rd];
4267                         break;
4268                 }
4269
4270                 if (!dtrace_canload(src, size, mstate, vstate)) {
4271                         regs[rd] = 0;
4272                         break;
4273                 }
4274
4275                 dtrace_bcopy((void *)src, (void *)dest, size);
4276                 break;
4277         }
4278
4279         case DIF_SUBR_ALLOCA:
4280         case DIF_SUBR_COPYIN: {
4281                 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4282                 uint64_t size =
4283                     tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4284                 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4285
4286                 /*
4287                  * This action doesn't require any credential checks since
4288                  * probes will not activate in user contexts to which the
4289                  * enabling user does not have permissions.
4290                  */
4291
4292                 /*
4293                  * Rounding up the user allocation size could have overflowed
4294                  * a large, bogus allocation (like -1ULL) to 0.
4295                  */
4296                 if (scratch_size < size ||
4297                     !DTRACE_INSCRATCH(mstate, scratch_size)) {
4298                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4299                         regs[rd] = 0;
4300                         break;
4301                 }
4302
4303                 if (subr == DIF_SUBR_COPYIN) {
4304                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4305                         dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4306                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4307                 }
4308
4309                 mstate->dtms_scratch_ptr += scratch_size;
4310                 regs[rd] = dest;
4311                 break;
4312         }
4313
4314         case DIF_SUBR_COPYINTO: {
4315                 uint64_t size = tupregs[1].dttk_value;
4316                 uintptr_t dest = tupregs[2].dttk_value;
4317
4318                 /*
4319                  * This action doesn't require any credential checks since
4320                  * probes will not activate in user contexts to which the
4321                  * enabling user does not have permissions.
4322                  */
4323                 if (!dtrace_inscratch(dest, size, mstate)) {
4324                         *flags |= CPU_DTRACE_BADADDR;
4325                         *illval = regs[rd];
4326                         break;
4327                 }
4328
4329                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4330                 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4331                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4332                 break;
4333         }
4334
4335         case DIF_SUBR_COPYINSTR: {
4336                 uintptr_t dest = mstate->dtms_scratch_ptr;
4337                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4338
4339                 if (nargs > 1 && tupregs[1].dttk_value < size)
4340                         size = tupregs[1].dttk_value + 1;
4341
4342                 /*
4343                  * This action doesn't require any credential checks since
4344                  * probes will not activate in user contexts to which the
4345                  * enabling user does not have permissions.
4346                  */
4347                 if (!DTRACE_INSCRATCH(mstate, size)) {
4348                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4349                         regs[rd] = 0;
4350                         break;
4351                 }
4352
4353                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4354                 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4355                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4356
4357                 ((char *)dest)[size - 1] = '\0';
4358                 mstate->dtms_scratch_ptr += size;
4359                 regs[rd] = dest;
4360                 break;
4361         }
4362
4363 #if defined(sun)
4364         case DIF_SUBR_MSGSIZE:
4365         case DIF_SUBR_MSGDSIZE: {
4366                 uintptr_t baddr = tupregs[0].dttk_value, daddr;
4367                 uintptr_t wptr, rptr;
4368                 size_t count = 0;
4369                 int cont = 0;
4370
4371                 while (baddr != 0 && !(*flags & CPU_DTRACE_FAULT)) {
4372
4373                         if (!dtrace_canload(baddr, sizeof (mblk_t), mstate,
4374                             vstate)) {
4375                                 regs[rd] = 0;
4376                                 break;
4377                         }
4378
4379                         wptr = dtrace_loadptr(baddr +
4380                             offsetof(mblk_t, b_wptr));
4381
4382                         rptr = dtrace_loadptr(baddr +
4383                             offsetof(mblk_t, b_rptr));
4384
4385                         if (wptr < rptr) {
4386                                 *flags |= CPU_DTRACE_BADADDR;
4387                                 *illval = tupregs[0].dttk_value;
4388                                 break;
4389                         }
4390
4391                         daddr = dtrace_loadptr(baddr +
4392                             offsetof(mblk_t, b_datap));
4393
4394                         baddr = dtrace_loadptr(baddr +
4395                             offsetof(mblk_t, b_cont));
4396
4397                         /*
4398                          * We want to protect against denial-of-service here,
4399                          * so we're only going to search the list for
4400                          * dtrace_msgdsize_max mblks.
4401                          */
4402                         if (cont++ > dtrace_msgdsize_max) {
4403                                 *flags |= CPU_DTRACE_ILLOP;
4404                                 break;
4405                         }
4406
4407                         if (subr == DIF_SUBR_MSGDSIZE) {
4408                                 if (dtrace_load8(daddr +
4409                                     offsetof(dblk_t, db_type)) != M_DATA)
4410                                         continue;
4411                         }
4412
4413                         count += wptr - rptr;
4414                 }
4415
4416                 if (!(*flags & CPU_DTRACE_FAULT))
4417                         regs[rd] = count;
4418
4419                 break;
4420         }
4421 #endif
4422
4423         case DIF_SUBR_PROGENYOF: {
4424                 pid_t pid = tupregs[0].dttk_value;
4425                 proc_t *p;
4426                 int rval = 0;
4427
4428                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4429
4430                 for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
4431 #if defined(sun)
4432                         if (p->p_pidp->pid_id == pid) {
4433 #else
4434                         if (p->p_pid == pid) {
4435 #endif
4436                                 rval = 1;
4437                                 break;
4438                         }
4439                 }
4440
4441                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4442
4443                 regs[rd] = rval;
4444                 break;
4445         }
4446
4447         case DIF_SUBR_SPECULATION:
4448                 regs[rd] = dtrace_speculation(state);
4449                 break;
4450
4451         case DIF_SUBR_COPYOUT: {
4452                 uintptr_t kaddr = tupregs[0].dttk_value;
4453                 uintptr_t uaddr = tupregs[1].dttk_value;
4454                 uint64_t size = tupregs[2].dttk_value;
4455
4456                 if (!dtrace_destructive_disallow &&
4457                     dtrace_priv_proc_control(state) &&
4458                     !dtrace_istoxic(kaddr, size)) {
4459                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4460                         dtrace_copyout(kaddr, uaddr, size, flags);
4461                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4462                 }
4463                 break;
4464         }
4465
4466         case DIF_SUBR_COPYOUTSTR: {
4467                 uintptr_t kaddr = tupregs[0].dttk_value;
4468                 uintptr_t uaddr = tupregs[1].dttk_value;
4469                 uint64_t size = tupregs[2].dttk_value;
4470
4471                 if (!dtrace_destructive_disallow &&
4472                     dtrace_priv_proc_control(state) &&
4473                     !dtrace_istoxic(kaddr, size)) {
4474                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4475                         dtrace_copyoutstr(kaddr, uaddr, size, flags);
4476                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4477                 }
4478                 break;
4479         }
4480
4481         case DIF_SUBR_STRLEN: {
4482                 size_t sz;
4483                 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4484                 sz = dtrace_strlen((char *)addr,
4485                     state->dts_options[DTRACEOPT_STRSIZE]);
4486
4487                 if (!dtrace_canload(addr, sz + 1, mstate, vstate)) {
4488                         regs[rd] = 0;
4489                         break;
4490                 }
4491
4492                 regs[rd] = sz;
4493
4494                 break;
4495         }
4496
4497         case DIF_SUBR_STRCHR:
4498         case DIF_SUBR_STRRCHR: {
4499                 /*
4500                  * We're going to iterate over the string looking for the
4501                  * specified character.  We will iterate until we have reached
4502                  * the string length or we have found the character.  If this
4503                  * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4504                  * of the specified character instead of the first.
4505                  */
4506                 uintptr_t saddr = tupregs[0].dttk_value;
4507                 uintptr_t addr = tupregs[0].dttk_value;
4508                 uintptr_t limit = addr + state->dts_options[DTRACEOPT_STRSIZE];
4509                 char c, target = (char)tupregs[1].dttk_value;
4510
                     /*
                      * regs[rd] starts at 0 (not found) and is overwritten with
                      * the address of each match.  The match test comes before
                      * the NUL test, so a target of '\0' matches the terminator.
                      */
4511                 for (regs[rd] = 0; addr < limit; addr++) {
4512                         if ((c = dtrace_load8(addr)) == target) {
4513                                 regs[rd] = addr;
4514
4515                                 if (subr == DIF_SUBR_STRCHR)
4516                                         break;
4517                         }
4518
4519                         if (c == '\0')
4520                                 break;
4521                 }
4522
                     /*
                      * NOTE(review): this check runs after the bytes have been
                      * read, and when the scan stopped early (match or NUL) the
                      * checked length addr - saddr does not include the byte at
                      * addr itself -- confirm that this is intended.
                      */
4523                 if (!dtrace_canload(saddr, addr - saddr, mstate, vstate)) {
4524                         regs[rd] = 0;
4525                         break;
4526                 }
4527
4528                 break;
4529         }
4530
4531         case DIF_SUBR_STRSTR:
4532         case DIF_SUBR_INDEX:
4533         case DIF_SUBR_RINDEX: {
4534                 /*
4535                  * We're going to iterate over the string looking for the
4536                  * specified string.  We will iterate until we have reached
4537                  * the string length or we have found the string.  (Yes, this
4538                  * is done in the most naive way possible -- but considering
4539                  * that the string we're searching for is likely to be
4540                  * relatively short, the complexity of Rabin-Karp or similar
4541                  * hardly seems merited.)
4542                  */
4543                 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4544                 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4545                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4546                 size_t len = dtrace_strlen(addr, size);
4547                 size_t sublen = dtrace_strlen(substr, size);
4548                 char *limit = addr + len, *orig = addr;
                     /*
                      * "Not found" is 0 (a NULL pointer) for strstr() and -1 for
                      * index()/rindex(); inc is the scan direction.
                      */
4549                 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4550                 int inc = 1;
4551
4552                 regs[rd] = notfound;
4553
                     /*
                      * A failed access check on either string yields 0 for all
                      * three subroutines, not the notfound value.
                      */
4554                 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4555                         regs[rd] = 0;
4556                         break;
4557                 }
4558
4559                 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4560                     vstate)) {
4561                         regs[rd] = 0;
4562                         break;
4563                 }
4564
4565                 /*
4566                  * strstr() and index()/rindex() have similar semantics if
4567                  * both strings are the empty string: strstr() returns a
4568                  * pointer to the (empty) string, and index() and rindex()
4569                  * both return index 0 (regardless of any position argument).
4570                  */
4571                 if (sublen == 0 && len == 0) {
4572                         if (subr == DIF_SUBR_STRSTR)
4573                                 regs[rd] = (uintptr_t)addr;
4574                         else
4575                                 regs[rd] = 0;
4576                         break;
4577                 }
4578
4579                 if (subr != DIF_SUBR_STRSTR) {
                             /*
                              * rindex() scans backwards: it starts at the
                              * terminator position (orig + len) and stops once
                              * addr reaches one byte before the string.
                              */
4580                         if (subr == DIF_SUBR_RINDEX) {
4581                                 limit = orig - 1;
4582                                 addr += len;
4583                                 inc = -1;
4584                         }
4585
4586                         /*
4587                          * Both index() and rindex() take an optional position
4588                          * argument that denotes the starting position.
4589                          */
4590                         if (nargs == 3) {
4591                                 int64_t pos = (int64_t)tupregs[2].dttk_value;
4592
4593                                 /*
4594                                  * If the position argument to index() is
4595                                  * negative, Perl implicitly clamps it at
4596                                  * zero.  This semantic is a little surprising
4597                                  * given the special meaning of negative
4598                                  * positions to similar Perl functions like
4599                                  * substr(), but it appears to reflect a
4600                                  * notion that index() can start from a
4601                                  * negative index and increment its way up to
4602                                  * the string.  Given this notion, Perl's
4603                                  * rindex() is at least self-consistent in
4604                                  * that it implicitly clamps positions greater
4605                                  * than the string length to be the string
4606                                  * length.  Where Perl completely loses
4607                                  * coherence, however, is when the specified
4608                                  * substring is the empty string ("").  In
4609                                  * this case, even if the position is
4610                                  * negative, rindex() returns 0 -- and even if
4611                                  * the position is greater than the length,
4612                                  * index() returns the string length.  These
4613                                  * semantics violate the notion that index()
4614                                  * should never return a value less than the
4615                                  * specified position and that rindex() should
4616                                  * never return a value greater than the
4617                                  * specified position.  (One assumes that
4618                                  * these semantics are artifacts of Perl's
4619                                  * implementation and not the results of
4620                                  * deliberate design -- it beggars belief that
4621                                  * even Larry Wall could desire such oddness.)
4622                                  * While in the abstract one would wish for
4623                                  * consistent position semantics across
4624                                  * substr(), index() and rindex() -- or at the
4625                                  * very least self-consistent position
4626                                  * semantics for index() and rindex() -- we
4627                                  * instead opt to keep with the extant Perl
4628                                  * semantics, in all their broken glory.  (Do
4629                                  * we have more desire to maintain Perl's
4630                                  * semantics than Perl does?  Probably.)
4631                                  */
4632                                 if (subr == DIF_SUBR_RINDEX) {
4633                                         if (pos < 0) {
4634                                                 if (sublen == 0)
4635                                                         regs[rd] = 0;
4636                                                 break;
4637                                         }
4638
4639                                         if (pos > len)
4640                                                 pos = len;
4641                                 } else {
4642                                         if (pos < 0)
4643                                                 pos = 0;
4644
4645                                         if (pos >= len) {
4646                                                 if (sublen == 0)
4647                                                         regs[rd] = len;
4648                                                 break;
4649                                         }
4650                                 }
4651
4652                                 addr = orig + pos;
4653                         }
4654                 }
4655
                     /*
                      * Compare sublen bytes at each candidate position, moving
                      * by inc until limit is reached; the first hit wins.
                      */
4656                 for (regs[rd] = notfound; addr != limit; addr += inc) {
4657                         if (dtrace_strncmp(addr, substr, sublen) == 0) {
4658                                 if (subr != DIF_SUBR_STRSTR) {
4659                                         /*
4660                                          * As D index() and rindex() are
4661                                          * modeled on Perl (and not on awk),
4662                                          * we return a zero-based (and not a
4663                                          * one-based) index.  (For you Perl
4664                                          * weenies: no, we're not going to add
4665                                          * $[ -- and shouldn't you be at a con
4666                                          * or something?)
4667                                          */
4668                                         regs[rd] = (uintptr_t)(addr - orig);
4669                                         break;
4670                                 }
4671
4672                                 ASSERT(subr == DIF_SUBR_STRSTR);
4673                                 regs[rd] = (uintptr_t)addr;
4674                                 break;
4675                         }
4676                 }
4677
4678                 break;
4679         }
4680
4681         case DIF_SUBR_STRTOK: {
                     /*
                      * strtok(): copy the next token of the input string into
                      * scratch and return a pointer to that copy.  tupregs[0] is
                      * the input string (or NULL to continue from the position
                      * saved in mstate->dtms_strtok) and tupregs[1] is the set of
                      * delimiter characters.  Returns 0 when no token remains or
                      * when an access/scratch check fails.
                      */
4682                 uintptr_t addr = tupregs[0].dttk_value;
4683                 uintptr_t tokaddr = tupregs[1].dttk_value;
4684                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4685                 uintptr_t limit, toklimit = tokaddr + size;
4686                 uint8_t c = 0, tokmap[32];       /* 256 / 8 */
4687                 char *dest = (char *)mstate->dtms_scratch_ptr;
4688                 int i;
4689
4690                 /*
4691                  * Check both the token buffer and (later) the input buffer,
4692                  * since both could be non-scratch addresses.
4693                  */
4694                 if (!dtrace_strcanload(tokaddr, size, mstate, vstate)) {
4695                         regs[rd] = 0;
4696                         break;
4697                 }
4698
4699                 if (!DTRACE_INSCRATCH(mstate, size)) {
4700                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4701                         regs[rd] = 0;
4702                         break;
4703                 }
4704
4705                 if (addr == 0) {
4706                         /*
4707                          * If the address specified is NULL, we use our saved
4708                          * strtok pointer from the mstate.  Note that this
4709                          * means that the saved strtok pointer is _only_
4710                          * valid within multiple enablings of the same probe --
4711                          * it behaves like an implicit clause-local variable.
4712                          */
4713                         addr = mstate->dtms_strtok;
4714                 } else {
4715                         /*
4716                          * If the user-specified address is non-NULL we must
4717                          * access check it.  This is the only time we have
4718                          * a chance to do so, since this address may reside
4719                          * in the string table of this clause-- future calls
4720                          * (when we fetch addr from mstate->dtms_strtok)
4721                          * would fail this access check.
4722                          */
4723                         if (!dtrace_strcanload(addr, size, mstate, vstate)) {
4724                                 regs[rd] = 0;
4725                                 break;
4726                         }
4727                 }
4728
4729                 /*
4730                  * First, zero the token map, and then process the token
4731                  * string -- setting a bit in the map for every character
4732                  * found in the token string.
4733                  */
4734                 for (i = 0; i < sizeof (tokmap); i++)
4735                         tokmap[i] = 0;
4736
                     /*
                      * c is a uint8_t, so c >> 3 selects one of the 32 map bytes
                      * and c & 0x7 selects the bit within it.
                      */
4737                 for (; tokaddr < toklimit; tokaddr++) {
4738                         if ((c = dtrace_load8(tokaddr)) == '\0')
4739                                 break;
4740
4741                         ASSERT((c >> 3) < sizeof (tokmap));
4742                         tokmap[c >> 3] |= (1 << (c & 0x7));
4743                 }
4744
                     /*
                      * Skip any leading delimiters; the scan is bounded to size
                      * bytes from the starting address.
                      */
4745                 for (limit = addr + size; addr < limit; addr++) {
4746                         /*
4747                          * We're looking for a character that is _not_ contained
4748                          * in the token string.
4749                          */
4750                         if ((c = dtrace_load8(addr)) == '\0')
4751                                 break;
4752
4753                         if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4754                                 break;
4755                 }
4756
4757                 if (c == '\0') {
4758                         /*
4759                          * We reached the end of the string without finding
4760                          * any character that was not in the token string.
4761                          * We return NULL in this case, and we set the saved
4762                          * address to NULL as well.
4763                          */
4764                         regs[rd] = 0;
4765                         mstate->dtms_strtok = 0;
4766                         break;
4767                 }
4768
4769                 /*
4770                  * From here on, we're copying into the destination string.
4771                  */
4772                 for (i = 0; addr < limit && i < size - 1; addr++) {
4773                         if ((c = dtrace_load8(addr)) == '\0')
4774                                 break;
4775
4776                         if (tokmap[c >> 3] & (1 << (c & 0x7)))
4777                                 break;
4778
4779                         ASSERT(i < size);
4780                         dest[i++] = c;
4781                 }
4782
                     /*
                      * Terminate the token, consume a full size bytes of scratch,
                      * and remember where the scan stopped for the next call.
                      */
4783                 ASSERT(i < size);
4784                 dest[i] = '\0';
4785                 regs[rd] = (uintptr_t)dest;
4786                 mstate->dtms_scratch_ptr += size;
4787                 mstate->dtms_strtok = addr;
4788                 break;
4789         }
4790
4791         case DIF_SUBR_SUBSTR: {
                     /*
                      * substr(): copy part of the string at tupregs[0] into
                      * scratch and return a pointer to the copy.  tupregs[1] is
                      * the starting index (negative values count back from the
                      * end of the string) and tupregs[2] is the optional length
                      * (a negative length is taken relative to the end of the
                      * string).  Returns 0 if the access or scratch check fails.
                      */
4792                 uintptr_t s = tupregs[0].dttk_value;
4793                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4794                 char *d = (char *)mstate->dtms_scratch_ptr;
4795                 int64_t index = (int64_t)tupregs[1].dttk_value;
4796                 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4797                 size_t len = dtrace_strlen((char *)s, size);
4798                 int64_t i;
4799
4800                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4801                         regs[rd] = 0;
4802                         break;
4803                 }
4804
4805                 if (!DTRACE_INSCRATCH(mstate, size)) {
4806                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4807                         regs[rd] = 0;
4808                         break;
4809                 }
4810
                     /*
                      * With no length argument, take everything up to the
                      * string size limit.
                      */
4811                 if (nargs <= 2)
4812                         remaining = (int64_t)size;
4813
                     /*
                      * A negative index is relative to the end of the string.
                      * If it still lands before the start but the requested
                      * span reaches into the string, trim the span and begin
                      * at offset 0.
                      */
4814                 if (index < 0) {
4815                         index += len;
4816
4817                         if (index < 0 && index + remaining > 0) {
4818                                 remaining += index;
4819                                 index = 0;
4820                         }
4821                 }
4822
4823                 if (index >= len || index < 0) {
4824                         remaining = 0;
4825                 } else if (remaining < 0) {
4826                         remaining += len - index;
4827                 } else if (index + remaining > size) {
4828                         remaining = size - index;
4829                 }
4830
                     /*
                      * Only size bytes of scratch were checked above, and the
                      * terminator stored after the loop needs one of them.  If
                      * the source has no NUL within its first size bytes, a
                      * full-size copy from index 0 would otherwise place that
                      * terminator at d[size].  Negative values are left alone
                      * so that the loop below still copies nothing for them.
                      */
                     if (remaining > 0 && (uint64_t)remaining >= size)
                             remaining = (int64_t)size - 1;

4831                 for (i = 0; i < remaining; i++) {
4832                         if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4833                                 break;
4834                 }
4835
4836                 d[i] = '\0';
4837
4838                 mstate->dtms_scratch_ptr += size;
4839                 regs[rd] = (uintptr_t)d;
4840                 break;
4841         }
4842
4843         case DIF_SUBR_JSON: {
                     /*
                      * json(): tupregs[0] is the JSON text and tupregs[1] is an
                      * element selector.  The selector is split into a packed
                      * list of NUL-terminated keys, which is handed together
                      * with the JSON text to dtrace_json().  Scratch layout: the
                      * first jsonlen + 1 bytes are passed as dest, and the key
                      * list is built directly after them.
                      */
4844                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4845                 uintptr_t json = tupregs[0].dttk_value;
4846                 size_t jsonlen = dtrace_strlen((char *)json, size);
4847                 uintptr_t elem = tupregs[1].dttk_value;
4848                 size_t elemlen = dtrace_strlen((char *)elem, size);
4849
4850                 char *dest = (char *)mstate->dtms_scratch_ptr;
4851                 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
4852                 char *ee = elemlist;
4853                 int nelems = 1;
4854                 uintptr_t cur;
4855
4856                 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
4857                     !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
4858                         regs[rd] = 0;
4859                         break;
4860                 }
4861
                     /*
                      * Room is needed for both regions: dest (jsonlen + 1) and
                      * the key list, which holds at most elemlen characters
                      * plus a final terminator.
                      */
4862                 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
4863                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4864                         regs[rd] = 0;
4865                         break;
4866                 }
4867
4868                 /*
4869                  * Read the element selector and split it up into a packed list
4870                  * of strings.
4871                  */
4872                 for (cur = elem; cur < elem + elemlen; cur++) {
4873                         char cc = dtrace_load8(cur);
4874
4875                         if (cur == elem && cc == '[') {
4876                                 /*
4877                                  * If the first element selector key is
4878                                  * actually an array index then ignore the
4879                                  * bracket.
4880                                  */
4881                                 continue;
4882                         }
4883
                             /* A closing bracket is dropped from the key list. */
4884                         if (cc == ']')
4885                                 continue;
4886
                             /*
                              * '.' and '[' both start a new key: count it and
                              * store a NUL in place of the separator.
                              */
4887                         if (cc == '.' || cc == '[') {
4888                                 nelems++;
4889                                 cc = '\0';
4890                         }
4891
4892                         *ee++ = cc;
4893                 }
4894                 *ee++ = '\0';
4895
                     /*
                      * Scratch is consumed only when dtrace_json() returns a
                      * non-zero result, and then only the dest region is kept;
                      * the key list is not retained.
                      */
4896                 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
4897                     nelems, dest)) != 0)
4898                         mstate->dtms_scratch_ptr += jsonlen + 1;
4899                 break;
4900         }
4901
4902         case DIF_SUBR_TOUPPER:
4903         case DIF_SUBR_TOLOWER: {
                     /*
                      * toupper()/tolower(): copy the string at tupregs[0] into
                      * scratch, changing the case of the letters 'a'-'z' or
                      * 'A'-'Z' respectively, and return a pointer to the copy.
                      * All other bytes are copied unchanged.  Returns 0 if the
                      * access or scratch check fails.
                      */
4904                 uintptr_t s = tupregs[0].dttk_value;
4905                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4906                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
4907                 size_t len = dtrace_strlen((char *)s, size);
4908                 char lower, upper, convert;
4909                 int64_t i;
4910
                     /*
                      * [lower, upper] is the range of characters to convert and
                      * convert is the character that lower maps to.
                      */
4911                 if (subr == DIF_SUBR_TOUPPER) {
4912                         lower = 'a';
4913                         upper = 'z';
4914                         convert = 'A';
4915                 } else {
4916                         lower = 'A';
4917                         upper = 'Z';
4918                         convert = 'a';
4919                 }
4920
4921                 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4922                         regs[rd] = 0;
4923                         break;
4924                 }
4925
4926                 if (!DTRACE_INSCRATCH(mstate, size)) {
4927                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4928                         regs[rd] = 0;
4929                         break;
4930                 }
4931
                     /*
                      * Copy at most size - 1 bytes so that the terminator stored
                      * below always fits within the size bytes checked above.
                      */
4932                 for (i = 0; i < size - 1; i++) {
4933                         if ((c = dtrace_load8(s + i)) == '\0')
4934                                 break;
4935
4936                         if (c >= lower && c <= upper)
4937                                 c = convert + (c - lower);
4938
4939                         dest[i] = c;
4940                 }
4941
4942                 ASSERT(i < size);
4943                 dest[i] = '\0';
4944                 regs[rd] = (uintptr_t)dest;
4945                 mstate->dtms_scratch_ptr += size;
4946                 break;
4947         }
4948
4949 #if defined(sun)
4950         case DIF_SUBR_GETMAJOR:
                     /*
                      * getmajor(): shift the value in tupregs[0] right past the
                      * minor-number bits and mask off the major number, using
                      * the 64-bit constants when _LP64 is defined.
                      */
4951 #ifdef _LP64
4952                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
4953 #else
4954                 regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR) & MAXMAJ;
4955 #endif
4956                 break;
4957
4958         case DIF_SUBR_GETMINOR:
                     /*
                      * getminor(): mask the minor number out of the value in
                      * tupregs[0], using the 64-bit mask when _LP64 is defined.
                      */
4959 #ifdef _LP64
4960                 regs[rd] = tupregs[0].dttk_value & MAXMIN64;
4961 #else
4962                 regs[rd] = tupregs[0].dttk_value & MAXMIN;
4963 #endif
4964                 break;
4965
4966         case DIF_SUBR_DDI_PATHNAME: {
4967                 /*
4968                  * This one is a galactic mess.  We are going to roughly
4969                  * emulate ddi_pathname(), but it's made more complicated
4970                  * by the fact that we (a) want to include the minor name and
4971                  * (b) must proceed iteratively instead of recursively.
4972                  */
4973                 uintptr_t dest = mstate->dtms_scratch_ptr;
4974                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4975                 char *start = (char *)dest, *end = start + size - 1;
4976                 uintptr_t daddr = tupregs[0].dttk_value;
4977                 int64_t minor = (int64_t)tupregs[1].dttk_value;
4978                 char *s;
4979                 int i, len, depth = 0;
4980
4981                 /*
4982                  * Due to all the pointer jumping we do and context we must
4983                  * rely upon, we just mandate that the user must have kernel
4984                  * read privileges to use this routine.
4985                  */
4986                 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) == 0) {
4987                         *flags |= CPU_DTRACE_KPRIV;
4988                         *illval = daddr;
4989                         regs[rd] = 0;
4990                 }
4991
4992                 if (!DTRACE_INSCRATCH(mstate, size)) {
4993                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4994                         regs[rd] = 0;
4995                         break;
4996                 }
4997
4998                 *end = '\0';
4999
5000                 /*
5001                  * We want to have a name for the minor.  In order to do this,
5002                  * we need to walk the minor list from the devinfo.  We want
5003                  * to be sure that we don't infinitely walk a circular list,
5004                  * so we check for circularity by sending a scout pointer
5005                  * ahead two elements for every element that we iterate over;
5006                  * if the list is circular, these will ultimately point to the
5007                  * same element.  You may recognize this little trick as the
5008                  * answer to a stupid interview question -- one that always
5009                  * seems to be asked by those who had to have it laboriously
5010                  * explained to them, and who can't even concisely describe
5011                  * the conditions under which one would be forced to resort to
5012                  * this technique.  Needless to say, those conditions are
5013                  * found here -- and probably only here.  Is this the only use
5014                  * of this infamous trick in shipping, production code?  If it
5015                  * isn't, it probably should be...
5016                  */
5017                 if (minor != -1) {
5018                         uintptr_t maddr = dtrace_loadptr(daddr +
5019                             offsetof(struct dev_info, devi_minor));
5020
5021                         uintptr_t next = offsetof(struct ddi_minor_data, next);
5022                         uintptr_t name = offsetof(struct ddi_minor_data,
5023                             d_minor) + offsetof(struct ddi_minor, name);
5024                         uintptr_t dev = offsetof(struct ddi_minor_data,
5025                             d_minor) + offsetof(struct ddi_minor, dev);
5026                         uintptr_t scout;
5027
5028                         if (maddr != NULL)
5029                                 scout = dtrace_loadptr(maddr + next);
5030
5031                         while (maddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5032                                 uint64_t m;
5033 #ifdef _LP64
5034                                 m = dtrace_load64(maddr + dev) & MAXMIN64;
5035 #else
5036                                 m = dtrace_load32(maddr + dev) & MAXMIN;
5037 #endif
5038                                 if (m != minor) {
5039                                         maddr = dtrace_loadptr(maddr + next);
5040
5041                                         if (scout == NULL)
5042                                                 continue;
5043
5044                                         scout = dtrace_loadptr(scout + next);
5045
5046                                         if (scout == NULL)
5047                                                 continue;
5048
5049                                         scout = dtrace_loadptr(scout + next);
5050
5051                                         if (scout == NULL)
5052                                                 continue;
5053
5054                                         if (scout == maddr) {
5055                                                 *flags |= CPU_DTRACE_ILLOP;
5056                                                 break;
5057                                         }
5058
5059                                         continue;
5060                                 }
5061
5062                                 /*
5063                                  * We have the minor data.  Now we need to
5064                                  * copy the minor's name into the end of the
5065                                  * pathname.
5066                                  */
5067                                 s = (char *)dtrace_loadptr(maddr + name);
5068                                 len = dtrace_strlen(s, size);
5069
5070                                 if (*flags & CPU_DTRACE_FAULT)
5071                                         break;
5072
5073                                 if (len != 0) {
5074                                         if ((end -= (len + 1)) < start)
5075                                                 break;
5076
5077                                         *end = ':';
5078                                 }
5079
5080                                 for (i = 1; i <= len; i++)
5081                                         end[i] = dtrace_load8((uintptr_t)s++);
5082                                 break;
5083                         }
5084                 }
5085
5086                 while (daddr != NULL && !(*flags & CPU_DTRACE_FAULT)) {
5087                         ddi_node_state_t devi_state;
5088
5089                         devi_state = dtrace_load32(daddr +
5090                             offsetof(struct dev_info, devi_node_state));
5091
5092                         if (*flags & CPU_DTRACE_FAULT)
5093                                 break;
5094
5095                         if (devi_state >= DS_INITIALIZED) {
5096                                 s = (char *)dtrace_loadptr(daddr +
5097                                     offsetof(struct dev_info, devi_addr));
5098                                 len = dtrace_strlen(s, size);
5099
5100                                 if (*flags & CPU_DTRACE_FAULT)
5101                                         break;
5102
5103                                 if (len != 0) {
5104                                         if ((end -= (len + 1)) < start)
5105                                                 break;
5106
5107                                         *end = '@';
5108                                 }
5109
5110                                 for (i = 1; i <= len; i++)
5111                                         end[i] = dtrace_load8((uintptr_t)s++);
5112                         }
5113
5114                         /*
5115                          * Now for the node name...
5116                          */
5117                         s = (char *)dtrace_loadptr(daddr +
5118                             offsetof(struct dev_info, devi_node_name));
5119
5120                         daddr = dtrace_loadptr(daddr +
5121                             offsetof(struct dev_info, devi_parent));
5122
5123                         /*
5124                          * If our parent is NULL (that is, if we're the root
5125                          * node), we're going to use the special path
5126                          * "devices".
5127                          */
5128                         if (daddr == 0)
5129                                 s = "devices";
5130
5131                         len = dtrace_strlen(s, size);
5132                         if (*flags & CPU_DTRACE_FAULT)
5133                                 break;
5134
5135                         if ((end -= (len + 1)) < start)
5136                                 break;
5137
5138                         for (i = 1; i <= len; i++)
5139                                 end[i] = dtrace_load8((uintptr_t)s++);
5140                         *end = '/';
5141
5142                         if (depth++ > dtrace_devdepth_max) {
5143                                 *flags |= CPU_DTRACE_ILLOP;
5144                                 break;
5145                         }
5146                 }
5147
5148                 if (end < start)
5149                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5150
5151                 if (daddr == 0) {
5152                         regs[rd] = (uintptr_t)end;
5153                         mstate->dtms_scratch_ptr += size;
5154                 }
5155
5156                 break;
5157         }
5158 #endif
5159
5160         case DIF_SUBR_STRJOIN: {
                     /*
                      * strjoin(): concatenate the strings at tupregs[0] and
                      * tupregs[1] into scratch and return a pointer to the
                      * result.  If the result (with its terminator) does not fit
                      * in size bytes, CPU_DTRACE_NOSCRATCH is raised and 0 is
                      * returned; 0 is also returned if an access check fails.
                      */
5161                 char *d = (char *)mstate->dtms_scratch_ptr;
5162                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5163                 uintptr_t s1 = tupregs[0].dttk_value;
5164                 uintptr_t s2 = tupregs[1].dttk_value;
5165                 int i = 0;
5166
5167                 if (!dtrace_strcanload(s1, size, mstate, vstate) ||
5168                     !dtrace_strcanload(s2, size, mstate, vstate)) {
5169                         regs[rd] = 0;
5170                         break;
5171                 }
5172
5173                 if (!DTRACE_INSCRATCH(mstate, size)) {
5174                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5175                         regs[rd] = 0;
5176                         break;
5177                 }
5178
                     /*
                      * Copy s1.  Its terminator is stored and then backed over
                      * (i--) so that s2 overwrites it.  The break statements in
                      * these loops leave only the loop, not the case; if this
                      * loop overflows, the bound check in the next loop fires
                      * at once.
                      */
5179                 for (;;) {
5180                         if (i >= size) {
5181                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5182                                 regs[rd] = 0;
5183                                 break;
5184                         }
5185
5186                         if ((d[i++] = dtrace_load8(s1++)) == '\0') {
5187                                 i--;
5188                                 break;
5189                         }
5190                 }
5191
                     /*
                      * Append s2, this time keeping the terminator in the count.
                      */
5192                 for (;;) {
5193                         if (i >= size) {
5194                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5195                                 regs[rd] = 0;
5196                                 break;
5197                         }
5198
5199                         if ((d[i++] = dtrace_load8(s2++)) == '\0')
5200                                 break;
5201                 }
5202
                     /*
                      * On success, only the i bytes actually written (not a full
                      * size bytes) are consumed from scratch.
                      */
5203                 if (i < size) {
5204                         mstate->dtms_scratch_ptr += i;
5205                         regs[rd] = (uintptr_t)d;
5206                 }
5207
5208                 break;
5209         }
5210
5211         case DIF_SUBR_STRTOLL: {
5212                 uintptr_t s = tupregs[0].dttk_value;
5213                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5214                 int base = 10;
5215
5216                 if (nargs > 1) {
5217                         if ((base = tupregs[1].dttk_value) <= 1 ||
5218                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5219                                 *flags |= CPU_DTRACE_ILLOP;
5220                                 break;
5221                         }
5222                 }
5223
5224                 if (!dtrace_strcanload(s, size, mstate, vstate)) {
5225                         regs[rd] = INT64_MIN;
5226                         break;
5227                 }
5228
5229                 regs[rd] = dtrace_strtoll((char *)s, base, size);
5230                 break;
5231         }
5232
5233         case DIF_SUBR_LLTOSTR: {
5234                 int64_t i = (int64_t)tupregs[0].dttk_value;
5235                 uint64_t val, digit;
5236                 uint64_t size = 65;     /* enough room for 2^64 in binary */
5237                 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
5238                 int base = 10;
5239
5240                 if (nargs > 1) {
5241                         if ((base = tupregs[1].dttk_value) <= 1 ||
5242                             base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
5243                                 *flags |= CPU_DTRACE_ILLOP;
5244                                 break;
5245                         }
5246                 }
5247
5248                 val = (base == 10 && i < 0) ? i * -1 : i;
5249
5250                 if (!DTRACE_INSCRATCH(mstate, size)) {
5251                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5252                         regs[rd] = 0;
5253                         break;
5254                 }
5255
5256                 for (*end-- = '\0'; val; val /= base) {
5257                         if ((digit = val % base) <= '9' - '0') {
5258                                 *end-- = '0' + digit;
5259                         } else {
5260                                 *end-- = 'a' + (digit - ('9' - '0') - 1);
5261                         }
5262                 }
5263
5264                 if (i == 0 && base == 16)
5265                         *end-- = '0';
5266
5267                 if (base == 16)
5268                         *end-- = 'x';
5269
5270                 if (i == 0 || base == 8 || base == 16)
5271                         *end-- = '0';
5272
5273                 if (i < 0 && base == 10)
5274                         *end-- = '-';
5275
5276                 regs[rd] = (uintptr_t)end + 1;
5277                 mstate->dtms_scratch_ptr += size;
5278                 break;
5279         }
5280
5281         case DIF_SUBR_HTONS:
5282         case DIF_SUBR_NTOHS:
5283 #if BYTE_ORDER == BIG_ENDIAN
5284                 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5285 #else
5286                 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5287 #endif
5288                 break;
5289
5290
5291         case DIF_SUBR_HTONL:
5292         case DIF_SUBR_NTOHL:
5293 #if BYTE_ORDER == BIG_ENDIAN
5294                 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5295 #else
5296                 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5297 #endif
5298                 break;
5299
5300
5301         case DIF_SUBR_HTONLL:
5302         case DIF_SUBR_NTOHLL:
5303 #if BYTE_ORDER == BIG_ENDIAN
5304                 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5305 #else
5306                 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5307 #endif
5308                 break;
5309
5310
5311         case DIF_SUBR_DIRNAME:
5312         case DIF_SUBR_BASENAME: {
5313                 char *dest = (char *)mstate->dtms_scratch_ptr;
5314                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5315                 uintptr_t src = tupregs[0].dttk_value;
5316                 int i, j, len = dtrace_strlen((char *)src, size);
5317                 int lastbase = -1, firstbase = -1, lastdir = -1;
5318                 int start, end;
5319
5320                 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5321                         regs[rd] = 0;
5322                         break;
5323                 }
5324
5325                 if (!DTRACE_INSCRATCH(mstate, size)) {
5326                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5327                         regs[rd] = 0;
5328                         break;
5329                 }
5330
5331                 /*
5332                  * The basename and dirname for a zero-length string is
5333                  * defined to be "."
5334                  */
5335                 if (len == 0) {
5336                         len = 1;
5337                         src = (uintptr_t)".";
5338                 }
5339
5340                 /*
5341                  * Start from the back of the string, moving back toward the
5342                  * front until we see a character that isn't a slash.  That
5343                  * character is the last character in the basename.
5344                  */
5345                 for (i = len - 1; i >= 0; i--) {
5346                         if (dtrace_load8(src + i) != '/')
5347                                 break;
5348                 }
5349
5350                 if (i >= 0)
5351                         lastbase = i;
5352
5353                 /*
5354                  * Starting from the last character in the basename, move
5355                  * towards the front until we find a slash.  The character
5356                  * that we processed immediately before that is the first
5357                  * character in the basename.
5358                  */
5359                 for (; i >= 0; i--) {
5360                         if (dtrace_load8(src + i) == '/')
5361                                 break;
5362                 }
5363
5364                 if (i >= 0)
5365                         firstbase = i + 1;
5366
5367                 /*
5368                  * Now keep going until we find a non-slash character.  That
5369                  * character is the last character in the dirname.
5370                  */
5371                 for (; i >= 0; i--) {
5372                         if (dtrace_load8(src + i) != '/')
5373                                 break;
5374                 }
5375
5376                 if (i >= 0)
5377                         lastdir = i;
5378
5379                 ASSERT(!(lastbase == -1 && firstbase != -1));
5380                 ASSERT(!(firstbase == -1 && lastdir != -1));
5381
5382                 if (lastbase == -1) {
5383                         /*
5384                          * We didn't find a non-slash character.  We know that
5385                          * the length is non-zero, so the whole string must be
5386                          * slashes.  In either the dirname or the basename
5387                          * case, we return '/'.
5388                          */
5389                         ASSERT(firstbase == -1);
5390                         firstbase = lastbase = lastdir = 0;
5391                 }
5392
5393                 if (firstbase == -1) {
5394                         /*
5395                          * The entire string consists only of a basename
5396                          * component.  If we're looking for dirname, we need
5397                          * to change our string to be just "."; if we're
5398                          * looking for a basename, we'll just set the index
5399                          * of the basename's first character to 0.
5400                          */
5401                         if (subr == DIF_SUBR_DIRNAME) {
5402                                 ASSERT(lastdir == -1);
5403                                 src = (uintptr_t)".";
5404                                 lastdir = 0;
5405                         } else {
5406                                 firstbase = 0;
5407                         }
5408                 }
5409
5410                 if (subr == DIF_SUBR_DIRNAME) {
5411                         if (lastdir == -1) {
5412                                 /*
5413                                  * We know that we have a slash in the name --
5414                                  * or lastdir would be set to 0, above.  And
5415                                  * because lastdir is -1, we know that this
5416                                  * slash must be the first character.  (That
5417                                  * is, the full string must be of the form
5418                                  * "/basename".)  In this case, the index of
5419                                  * the directory name's last character is 0.
5420                                  */
5421                                 lastdir = 0;
5422                         }
5423
5424                         start = 0;
5425                         end = lastdir;
5426                 } else {
5427                         ASSERT(subr == DIF_SUBR_BASENAME);
5428                         ASSERT(firstbase != -1 && lastbase != -1);
5429                         start = firstbase;
5430                         end = lastbase;
5431                 }
5432
5433                 for (i = start, j = 0; i <= end && j < size - 1; i++, j++)
5434                         dest[j] = dtrace_load8(src + i);
5435
5436                 dest[j] = '\0';
5437                 regs[rd] = (uintptr_t)dest;
5438                 mstate->dtms_scratch_ptr += size;
5439                 break;
5440         }
5441
5442         case DIF_SUBR_GETF: {
5443                 uintptr_t fd = tupregs[0].dttk_value;
5444                 struct filedesc *fdp;
5445                 file_t *fp;
5446
5447                 if (!dtrace_priv_proc(state)) {
5448                         regs[rd] = 0;
5449                         break;
5450                 }
5451                 fdp = curproc->p_fd;
5452                 FILEDESC_SLOCK(fdp);
5453                 fp = fget_locked(fdp, fd);
5454                 mstate->dtms_getf = fp;
5455                 regs[rd] = (uintptr_t)fp;
5456                 FILEDESC_SUNLOCK(fdp);
5457                 break;
5458         }
5459
5460         case DIF_SUBR_CLEANPATH: {
5461                 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5462                 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5463                 uintptr_t src = tupregs[0].dttk_value;
5464                 int i = 0, j = 0;
5465 #if defined(sun)
5466                 zone_t *z;
5467 #endif
5468
5469                 if (!dtrace_strcanload(src, size, mstate, vstate)) {
5470                         regs[rd] = 0;
5471                         break;
5472                 }
5473
5474                 if (!DTRACE_INSCRATCH(mstate, size)) {
5475                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5476                         regs[rd] = 0;
5477                         break;
5478                 }
5479
5480                 /*
5481                  * Move forward, loading each character.
5482                  */
5483                 do {
5484                         c = dtrace_load8(src + i++);
5485 next:
5486                         if (j + 5 >= size)      /* 5 = strlen("/..c\0") */
5487                                 break;
5488
5489                         if (c != '/') {
5490                                 dest[j++] = c;
5491                                 continue;
5492                         }
5493
5494                         c = dtrace_load8(src + i++);
5495
5496                         if (c == '/') {
5497                                 /*
5498                                  * We have two slashes -- we can just advance
5499                                  * to the next character.
5500                                  */
5501                                 goto next;
5502                         }
5503
5504                         if (c != '.') {
5505                                 /*
5506                                  * This is not "." and it's not ".." -- we can
5507                                  * just store the "/" and this character and
5508                                  * drive on.
5509                                  */
5510                                 dest[j++] = '/';
5511                                 dest[j++] = c;
5512                                 continue;
5513                         }
5514
5515                         c = dtrace_load8(src + i++);
5516
5517                         if (c == '/') {
5518                                 /*
5519                                  * This is a "/./" component.  We're not going
5520                                  * to store anything in the destination buffer;
5521                                  * we're just going to go to the next component.
5522                                  */
5523                                 goto next;
5524                         }
5525
5526                         if (c != '.') {
5527                                 /*
5528                                  * This is not ".." -- we can just store the
5529                                  * "/." and this character and continue
5530                                  * processing.
5531                                  */
5532                                 dest[j++] = '/';
5533                                 dest[j++] = '.';
5534                                 dest[j++] = c;
5535                                 continue;
5536                         }
5537
5538                         c = dtrace_load8(src + i++);
5539
5540                         if (c != '/' && c != '\0') {
5541                                 /*
5542                                  * This is not ".." -- it's "..[mumble]".
5543                                  * We'll store the "/.." and this character
5544                                  * and continue processing.
5545                                  */
5546                                 dest[j++] = '/';
5547                                 dest[j++] = '.';
5548                                 dest[j++] = '.';
5549                                 dest[j++] = c;
5550                                 continue;
5551                         }
5552
5553                         /*
5554                          * This is "/../" or "/..\0".  We need to back up
5555                          * our destination pointer until we find a "/".
5556                          */
5557                         i--;
5558                         while (j != 0 && dest[--j] != '/')
5559                                 continue;
5560
5561                         if (c == '\0')
5562                                 dest[++j] = '/';
5563                 } while (c != '\0');
5564
5565                 dest[j] = '\0';
5566
5567 #if defined(sun)
5568                 if (mstate->dtms_getf != NULL &&
5569                     !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
5570                     (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
5571                         /*
5572                          * If we've done a getf() as a part of this ECB and we
5573                          * don't have kernel access (and we're not in the global
5574                          * zone), check if the path we cleaned up begins with
5575                          * the zone's root path, and trim it off if so.  Note
5576                          * that this is an output cleanliness issue, not a
5577                          * security issue: knowing one's zone root path does
5578                          * not enable privilege escalation.
5579                          */
5580                         if (strstr(dest, z->zone_rootpath) == dest)
5581                                 dest += strlen(z->zone_rootpath) - 1;
5582                 }
5583 #endif
5584
5585                 regs[rd] = (uintptr_t)dest;
5586                 mstate->dtms_scratch_ptr += size;
5587                 break;
5588         }
5589
5590         case DIF_SUBR_INET_NTOA:
5591         case DIF_SUBR_INET_NTOA6:
5592         case DIF_SUBR_INET_NTOP: {
5593                 size_t size;
5594                 int af, argi, i;
5595                 char *base, *end;
5596
5597                 if (subr == DIF_SUBR_INET_NTOP) {
5598                         af = (int)tupregs[0].dttk_value;
5599                         argi = 1;
5600                 } else {
5601                         af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5602                         argi = 0;
5603                 }
5604
5605                 if (af == AF_INET) {
5606                         ipaddr_t ip4;
5607                         uint8_t *ptr8, val;
5608
5609                         /*
5610                          * Safely load the IPv4 address.
5611                          */
5612                         ip4 = dtrace_load32(tupregs[argi].dttk_value);
5613
5614                         /*
5615                          * Check an IPv4 string will fit in scratch.
5616                          */
5617                         size = INET_ADDRSTRLEN;
5618                         if (!DTRACE_INSCRATCH(mstate, size)) {
5619                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5620                                 regs[rd] = 0;
5621                                 break;
5622                         }
5623                         base = (char *)mstate->dtms_scratch_ptr;
5624                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5625
5626                         /*
5627                          * Stringify as a dotted decimal quad.
5628                          */
5629                         *end-- = '\0';
5630                         ptr8 = (uint8_t *)&ip4;
5631                         for (i = 3; i >= 0; i--) {
5632                                 val = ptr8[i];
5633
5634                                 if (val == 0) {
5635                                         *end-- = '0';
5636                                 } else {
5637                                         for (; val; val /= 10) {
5638                                                 *end-- = '0' + (val % 10);
5639                                         }
5640                                 }
5641
5642                                 if (i > 0)
5643                                         *end-- = '.';
5644                         }
5645                         ASSERT(end + 1 >= base);
5646
5647                 } else if (af == AF_INET6) {
5648                         struct in6_addr ip6;
5649                         int firstzero, tryzero, numzero, v6end;
5650                         uint16_t val;
5651                         const char digits[] = "0123456789abcdef";
5652
5653                         /*
5654                          * Stringify using RFC 1884 convention 2 - 16 bit
5655                          * hexadecimal values with a zero-run compression.
5656                          * Lower case hexadecimal digits are used.
5657                          *      eg, fe80::214:4fff:fe0b:76c8.
5658                          * The IPv4 embedded form is returned for inet_ntop,
5659                          * just the IPv4 string is returned for inet_ntoa6.
5660                          */
5661
5662                         /*
5663                          * Safely load the IPv6 address.
5664                          */
5665                         dtrace_bcopy(
5666                             (void *)(uintptr_t)tupregs[argi].dttk_value,
5667                             (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5668
5669                         /*
5670                          * Check an IPv6 string will fit in scratch.
5671                          */
5672                         size = INET6_ADDRSTRLEN;
5673                         if (!DTRACE_INSCRATCH(mstate, size)) {
5674                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5675                                 regs[rd] = 0;
5676                                 break;
5677                         }
5678                         base = (char *)mstate->dtms_scratch_ptr;
5679                         end = (char *)mstate->dtms_scratch_ptr + size - 1;
5680                         *end-- = '\0';
5681
5682                         /*
5683                          * Find the longest run of 16 bit zero values
5684                          * for the single allowed zero compression - "::".
5685                          */
5686                         firstzero = -1;
5687                         tryzero = -1;
5688                         numzero = 1;
5689                         for (i = 0; i < sizeof (struct in6_addr); i++) {
5690 #if defined(sun)
5691                                 if (ip6._S6_un._S6_u8[i] == 0 &&
5692 #else
5693                                 if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5694 #endif
5695                                     tryzero == -1 && i % 2 == 0) {
5696                                         tryzero = i;
5697                                         continue;
5698                                 }
5699
5700                                 if (tryzero != -1 &&
5701 #if defined(sun)
5702                                     (ip6._S6_un._S6_u8[i] != 0 ||
5703 #else
5704                                     (ip6.__u6_addr.__u6_addr8[i] != 0 ||
5705 #endif
5706                                     i == sizeof (struct in6_addr) - 1)) {
5707
5708                                         if (i - tryzero <= numzero) {
5709                                                 tryzero = -1;
5710                                                 continue;
5711                                         }
5712
5713                                         firstzero = tryzero;
5714                                         numzero = i - i % 2 - tryzero;
5715                                         tryzero = -1;
5716
5717 #if defined(sun)
5718                                         if (ip6._S6_un._S6_u8[i] == 0 &&
5719 #else
5720                                         if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
5721 #endif
5722                                             i == sizeof (struct in6_addr) - 1)
5723                                                 numzero += 2;
5724                                 }
5725                         }
5726                         ASSERT(firstzero + numzero <= sizeof (struct in6_addr));
5727
5728                         /*
5729                          * Check for an IPv4 embedded address.
5730                          */
5731                         v6end = sizeof (struct in6_addr) - 2;
5732                         if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5733                             IN6_IS_ADDR_V4COMPAT(&ip6)) {
5734                                 for (i = sizeof (struct in6_addr) - 1;
5735                                     i >= DTRACE_V4MAPPED_OFFSET; i--) {
5736                                         ASSERT(end >= base);
5737
5738 #if defined(sun)
5739                                         val = ip6._S6_un._S6_u8[i];
5740 #else
5741                                         val = ip6.__u6_addr.__u6_addr8[i];
5742 #endif
5743
5744                                         if (val == 0) {
5745                                                 *end-- = '0';
5746                                         } else {
5747                                                 for (; val; val /= 10) {
5748                                                         *end-- = '0' + val % 10;
5749                                                 }
5750                                         }
5751
5752                                         if (i > DTRACE_V4MAPPED_OFFSET)
5753                                                 *end-- = '.';
5754                                 }
5755
5756                                 if (subr == DIF_SUBR_INET_NTOA6)
5757                                         goto inetout;
5758
5759                                 /*
5760                                  * Set v6end to skip the IPv4 address that
5761                                  * we have already stringified.
5762                                  */
5763                                 v6end = 10;
5764                         }
5765
5766                         /*
5767                          * Build the IPv6 string by working through the
5768                          * address in reverse.
5769                          */
5770                         for (i = v6end; i >= 0; i -= 2) {
5771                                 ASSERT(end >= base);
5772
5773                                 if (i == firstzero + numzero - 2) {
5774                                         *end-- = ':';
5775                                         *end-- = ':';
5776                                         i -= numzero - 2;
5777                                         continue;
5778                                 }
5779
5780                                 if (i < 14 && i != firstzero - 2)
5781                                         *end-- = ':';
5782
5783 #if defined(sun)
5784                                 val = (ip6._S6_un._S6_u8[i] << 8) +
5785                                     ip6._S6_un._S6_u8[i + 1];
5786 #else
5787                                 val = (ip6.__u6_addr.__u6_addr8[i] << 8) +
5788                                     ip6.__u6_addr.__u6_addr8[i + 1];
5789 #endif
5790
5791                                 if (val == 0) {
5792                                         *end-- = '0';
5793                                 } else {
5794                                         for (; val; val /= 16) {
5795                                                 *end-- = digits[val % 16];
5796                                         }
5797                                 }
5798                         }
5799                         ASSERT(end + 1 >= base);
5800
5801                 } else {
5802                         /*
5803                          * The user didn't use AF_INET or AF_INET6.
5804                          */
5805                         DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5806                         regs[rd] = 0;
5807                         break;
5808                 }
5809
5810 inetout:        regs[rd] = (uintptr_t)end + 1;
5811                 mstate->dtms_scratch_ptr += size;
5812                 break;
5813         }
5814
5815         case DIF_SUBR_MEMREF: {
5816                 uintptr_t size = 2 * sizeof(uintptr_t);
5817                 uintptr_t *memref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5818                 size_t scratch_size = ((uintptr_t) memref - mstate->dtms_scratch_ptr) + size;
5819
5820                 /* address and length */
5821                 memref[0] = tupregs[0].dttk_value;
5822                 memref[1] = tupregs[1].dttk_value;
5823
5824                 regs[rd] = (uintptr_t) memref;
5825                 mstate->dtms_scratch_ptr += scratch_size;
5826                 break;
5827         }
5828
5829 #if !defined(sun)
5830         case DIF_SUBR_MEMSTR: {
5831                 char *str = (char *)mstate->dtms_scratch_ptr;
5832                 uintptr_t mem = tupregs[0].dttk_value;
5833                 char c = tupregs[1].dttk_value;
5834                 size_t size = tupregs[2].dttk_value;
5835                 uint8_t n;
5836                 int i;
5837
5838                 regs[rd] = 0;
5839
5840                 if (size == 0)
5841                         break;
5842
5843                 if (!dtrace_canload(mem, size - 1, mstate, vstate))
5844                         break;
5845
5846                 if (!DTRACE_INSCRATCH(mstate, size)) {
5847                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5848                         break;
5849                 }
5850
5851                 if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
5852                         *flags |= CPU_DTRACE_ILLOP;
5853                         break;
5854                 }
5855
5856                 for (i = 0; i < size - 1; i++) {
5857                         n = dtrace_load8(mem++);
5858                         str[i] = (n == 0) ? c : n;
5859                 }
5860                 str[size - 1] = 0;
5861
5862                 regs[rd] = (uintptr_t)str;
5863                 mstate->dtms_scratch_ptr += size;
5864                 break;
5865         }
5866 #endif
5867
5868         case DIF_SUBR_TYPEREF: {
5869                 uintptr_t size = 4 * sizeof(uintptr_t);
5870                 uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
5871                 size_t scratch_size = ((uintptr_t) typeref - mstate->dtms_scratch_ptr) + size;
5872
5873                 /* address, num_elements, type_str, type_len */
5874                 typeref[0] = tupregs[0].dttk_value;
5875                 typeref[1] = tupregs[1].dttk_value;
5876                 typeref[2] = tupregs[2].dttk_value;
5877                 typeref[3] = tupregs[3].dttk_value;
5878
5879                 regs[rd] = (uintptr_t) typeref;
5880                 mstate->dtms_scratch_ptr += scratch_size;
5881                 break;
5882         }
5883         }
5884 }
5885
5886 /*
5887  * Emulate the execution of DTrace IR instructions specified by the given
5888  * DIF object.  This function is deliberately void of assertions as all of
5889  * the necessary checks are handled by a call to dtrace_difo_validate().
5890  */
5891 static uint64_t
5892 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5893     dtrace_vstate_t *vstate, dtrace_state_t *state)
5894 {
5895         const dif_instr_t *text = difo->dtdo_buf;
5896         const uint_t textlen = difo->dtdo_len;
5897         const char *strtab = difo->dtdo_strtab;
5898         const uint64_t *inttab = difo->dtdo_inttab;
5899
5900         uint64_t rval = 0;
5901         dtrace_statvar_t *svar;
5902         dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5903         dtrace_difv_t *v;
5904         volatile uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
5905         volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
5906
5907         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5908         uint64_t regs[DIF_DIR_NREGS];
5909         uint64_t *tmp;
5910
5911         uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5912         int64_t cc_r;
5913         uint_t pc = 0, id, opc = 0;
5914         uint8_t ttop = 0;
5915         dif_instr_t instr;
5916         uint_t r1, r2, rd;
5917
5918         /*
5919          * We stash the current DIF object into the machine state: we need it
5920          * for subsequent access checking.
5921          */
5922         mstate->dtms_difo = difo;
5923
5924         regs[DIF_REG_R0] = 0;           /* %r0 is fixed at zero */
5925
5926         while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5927                 opc = pc;
5928
5929                 instr = text[pc++];
5930                 r1 = DIF_INSTR_R1(instr);
5931                 r2 = DIF_INSTR_R2(instr);
5932                 rd = DIF_INSTR_RD(instr);
5933
5934                 switch (DIF_INSTR_OP(instr)) {
5935                 case DIF_OP_OR:
5936                         regs[rd] = regs[r1] | regs[r2];
5937                         break;
5938                 case DIF_OP_XOR:
5939                         regs[rd] = regs[r1] ^ regs[r2];
5940                         break;
5941                 case DIF_OP_AND:
5942                         regs[rd] = regs[r1] & regs[r2];
5943                         break;
5944                 case DIF_OP_SLL:
5945                         regs[rd] = regs[r1] << regs[r2];
5946                         break;
5947                 case DIF_OP_SRL:
5948                         regs[rd] = regs[r1] >> regs[r2];
5949                         break;
5950                 case DIF_OP_SUB:
5951                         regs[rd] = regs[r1] - regs[r2];
5952                         break;
5953                 case DIF_OP_ADD:
5954                         regs[rd] = regs[r1] + regs[r2];
5955                         break;
5956                 case DIF_OP_MUL:
5957                         regs[rd] = regs[r1] * regs[r2];
5958                         break;
5959                 case DIF_OP_SDIV:
5960                         if (regs[r2] == 0) {
5961                                 regs[rd] = 0;
5962                                 *flags |= CPU_DTRACE_DIVZERO;
5963                         } else {
5964                                 regs[rd] = (int64_t)regs[r1] /
5965                                     (int64_t)regs[r2];
5966                         }
5967                         break;
5968
5969                 case DIF_OP_UDIV:
5970                         if (regs[r2] == 0) {
5971                                 regs[rd] = 0;
5972                                 *flags |= CPU_DTRACE_DIVZERO;
5973                         } else {
5974                                 regs[rd] = regs[r1] / regs[r2];
5975                         }
5976                         break;
5977
5978                 case DIF_OP_SREM:
5979                         if (regs[r2] == 0) {
5980                                 regs[rd] = 0;
5981                                 *flags |= CPU_DTRACE_DIVZERO;
5982                         } else {
5983                                 regs[rd] = (int64_t)regs[r1] %
5984                                     (int64_t)regs[r2];
5985                         }
5986                         break;
5987
5988                 case DIF_OP_UREM:
5989                         if (regs[r2] == 0) {
5990                                 regs[rd] = 0;
5991                                 *flags |= CPU_DTRACE_DIVZERO;
5992                         } else {
5993                                 regs[rd] = regs[r1] % regs[r2];
5994                         }
5995                         break;
5996
5997                 case DIF_OP_NOT:
5998                         regs[rd] = ~regs[r1];
5999                         break;
6000                 case DIF_OP_MOV:
6001                         regs[rd] = regs[r1];
6002                         break;
6003                 case DIF_OP_CMP:
6004                         cc_r = regs[r1] - regs[r2];
6005                         cc_n = cc_r < 0;
6006                         cc_z = cc_r == 0;
6007                         cc_v = 0;
6008                         cc_c = regs[r1] < regs[r2];
6009                         break;
6010                 case DIF_OP_TST:
6011                         cc_n = cc_v = cc_c = 0;
6012                         cc_z = regs[r1] == 0;
6013                         break;
6014                 case DIF_OP_BA:
6015                         pc = DIF_INSTR_LABEL(instr);
6016                         break;
6017                 case DIF_OP_BE:
6018                         if (cc_z)
6019                                 pc = DIF_INSTR_LABEL(instr);
6020                         break;
6021                 case DIF_OP_BNE:
6022                         if (cc_z == 0)
6023                                 pc = DIF_INSTR_LABEL(instr);
6024                         break;
6025                 case DIF_OP_BG:
6026                         if ((cc_z | (cc_n ^ cc_v)) == 0)
6027                                 pc = DIF_INSTR_LABEL(instr);
6028                         break;
6029                 case DIF_OP_BGU:
6030                         if ((cc_c | cc_z) == 0)
6031                                 pc = DIF_INSTR_LABEL(instr);
6032                         break;
6033                 case DIF_OP_BGE:
6034                         if ((cc_n ^ cc_v) == 0)
6035                                 pc = DIF_INSTR_LABEL(instr);
6036                         break;
6037                 case DIF_OP_BGEU:
6038                         if (cc_c == 0)
6039                                 pc = DIF_INSTR_LABEL(instr);
6040                         break;
6041                 case DIF_OP_BL:
6042                         if (cc_n ^ cc_v)
6043                                 pc = DIF_INSTR_LABEL(instr);
6044                         break;
6045                 case DIF_OP_BLU:
6046                         if (cc_c)
6047                                 pc = DIF_INSTR_LABEL(instr);
6048                         break;
6049                 case DIF_OP_BLE:
6050                         if (cc_z | (cc_n ^ cc_v))
6051                                 pc = DIF_INSTR_LABEL(instr);
6052                         break;
6053                 case DIF_OP_BLEU:
6054                         if (cc_c | cc_z)
6055                                 pc = DIF_INSTR_LABEL(instr);
6056                         break;
6057                 case DIF_OP_RLDSB:
6058                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6059                                 break;
6060                         /*FALLTHROUGH*/
6061                 case DIF_OP_LDSB:
6062                         regs[rd] = (int8_t)dtrace_load8(regs[r1]);
6063                         break;
6064                 case DIF_OP_RLDSH:
6065                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6066                                 break;
6067                         /*FALLTHROUGH*/
6068                 case DIF_OP_LDSH:
6069                         regs[rd] = (int16_t)dtrace_load16(regs[r1]);
6070                         break;
6071                 case DIF_OP_RLDSW:
6072                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6073                                 break;
6074                         /*FALLTHROUGH*/
6075                 case DIF_OP_LDSW:
6076                         regs[rd] = (int32_t)dtrace_load32(regs[r1]);
6077                         break;
6078                 case DIF_OP_RLDUB:
6079                         if (!dtrace_canload(regs[r1], 1, mstate, vstate))
6080                                 break;
6081                         /*FALLTHROUGH*/
6082                 case DIF_OP_LDUB:
6083                         regs[rd] = dtrace_load8(regs[r1]);
6084                         break;
6085                 case DIF_OP_RLDUH:
6086                         if (!dtrace_canload(regs[r1], 2, mstate, vstate))
6087                                 break;
6088                         /*FALLTHROUGH*/
6089                 case DIF_OP_LDUH:
6090                         regs[rd] = dtrace_load16(regs[r1]);
6091                         break;
6092                 case DIF_OP_RLDUW:
6093                         if (!dtrace_canload(regs[r1], 4, mstate, vstate))
6094                                 break;
6095                         /*FALLTHROUGH*/
6096                 case DIF_OP_LDUW:
6097                         regs[rd] = dtrace_load32(regs[r1]);
6098                         break;
6099                 case DIF_OP_RLDX:
6100                         if (!dtrace_canload(regs[r1], 8, mstate, vstate))
6101                                 break;
6102                         /*FALLTHROUGH*/
6103                 case DIF_OP_LDX:
6104                         regs[rd] = dtrace_load64(regs[r1]);
6105                         break;
6106                 case DIF_OP_ULDSB:
6107                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6108                         regs[rd] = (int8_t)
6109                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6110                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6111                         break;
6112                 case DIF_OP_ULDSH:
6113                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6114                         regs[rd] = (int16_t)
6115                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6116                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6117                         break;
6118                 case DIF_OP_ULDSW:
6119                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6120                         regs[rd] = (int32_t)
6121                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6122                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6123                         break;
6124                 case DIF_OP_ULDUB:
6125                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6126                         regs[rd] =
6127                             dtrace_fuword8((void *)(uintptr_t)regs[r1]);
6128                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6129                         break;
6130                 case DIF_OP_ULDUH:
6131                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6132                         regs[rd] =
6133                             dtrace_fuword16((void *)(uintptr_t)regs[r1]);
6134                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6135                         break;
6136                 case DIF_OP_ULDUW:
6137                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6138                         regs[rd] =
6139                             dtrace_fuword32((void *)(uintptr_t)regs[r1]);
6140                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6141                         break;
6142                 case DIF_OP_ULDX:
6143                         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6144                         regs[rd] =
6145                             dtrace_fuword64((void *)(uintptr_t)regs[r1]);
6146                         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6147                         break;
6148                 case DIF_OP_RET:
6149                         rval = regs[rd];
6150                         pc = textlen;
6151                         break;
6152                 case DIF_OP_NOP:
6153                         break;
6154                 case DIF_OP_SETX:
6155                         regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6156                         break;
6157                 case DIF_OP_SETS:
6158                         regs[rd] = (uint64_t)(uintptr_t)
6159                             (strtab + DIF_INSTR_STRING(instr));
6160                         break;
6161                 case DIF_OP_SCMP: {
6162                         size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6163                         uintptr_t s1 = regs[r1];
6164                         uintptr_t s2 = regs[r2];
6165
6166                         if (s1 != 0 &&
6167                             !dtrace_strcanload(s1, sz, mstate, vstate))
6168                                 break;
6169                         if (s2 != 0 &&
6170                             !dtrace_strcanload(s2, sz, mstate, vstate))
6171                                 break;
6172
6173                         cc_r = dtrace_strncmp((char *)s1, (char *)s2, sz);
6174
6175                         cc_n = cc_r < 0;
6176                         cc_z = cc_r == 0;
6177                         cc_v = cc_c = 0;
6178                         break;
6179                 }
6180                 case DIF_OP_LDGA:
6181                         regs[rd] = dtrace_dif_variable(mstate, state,
6182                             r1, regs[r2]);
6183                         break;
6184                 case DIF_OP_LDGS:
6185                         id = DIF_INSTR_VAR(instr);
6186
6187                         if (id >= DIF_VAR_OTHER_UBASE) {
6188                                 uintptr_t a;
6189
6190                                 id -= DIF_VAR_OTHER_UBASE;
6191                                 svar = vstate->dtvs_globals[id];
6192                                 ASSERT(svar != NULL);
6193                                 v = &svar->dtsv_var;
6194
6195                                 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6196                                         regs[rd] = svar->dtsv_data;
6197                                         break;
6198                                 }
6199
6200                                 a = (uintptr_t)svar->dtsv_data;
6201
6202                                 if (*(uint8_t *)a == UINT8_MAX) {
6203                                         /*
6204                                          * If the 0th byte is set to UINT8_MAX
6205                                          * then this is to be treated as a
6206                                          * reference to a NULL variable.
6207                                          */
6208                                         regs[rd] = 0;
6209                                 } else {
6210                                         regs[rd] = a + sizeof (uint64_t);
6211                                 }
6212
6213                                 break;
6214                         }
6215
6216                         regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6217                         break;
6218
6219                 case DIF_OP_STGS:
6220                         id = DIF_INSTR_VAR(instr);
6221
6222                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6223                         id -= DIF_VAR_OTHER_UBASE;
6224
6225                         svar = vstate->dtvs_globals[id];
6226                         ASSERT(svar != NULL);
6227                         v = &svar->dtsv_var;
6228
6229                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6230                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6231
6232                                 ASSERT(a != 0);
6233                                 ASSERT(svar->dtsv_size != 0);
6234
6235                                 if (regs[rd] == 0) {
6236                                         *(uint8_t *)a = UINT8_MAX;
6237                                         break;
6238                                 } else {
6239                                         *(uint8_t *)a = 0;
6240                                         a += sizeof (uint64_t);
6241                                 }
6242                                 if (!dtrace_vcanload(
6243                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6244                                     mstate, vstate))
6245                                         break;
6246
6247                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6248                                     (void *)a, &v->dtdv_type);
6249                                 break;
6250                         }
6251
6252                         svar->dtsv_data = regs[rd];
6253                         break;
6254
6255                 case DIF_OP_LDTA:
6256                         /*
6257                          * There are no DTrace built-in thread-local arrays at
6258                          * present.  This opcode is saved for future work.
6259                          */
6260                         *flags |= CPU_DTRACE_ILLOP;
6261                         regs[rd] = 0;
6262                         break;
6263
6264                 case DIF_OP_LDLS:
6265                         id = DIF_INSTR_VAR(instr);
6266
6267                         if (id < DIF_VAR_OTHER_UBASE) {
6268                                 /*
6269                                  * For now, this has no meaning.
6270                                  */
6271                                 regs[rd] = 0;
6272                                 break;
6273                         }
6274
6275                         id -= DIF_VAR_OTHER_UBASE;
6276
6277                         ASSERT(id < vstate->dtvs_nlocals);
6278                         ASSERT(vstate->dtvs_locals != NULL);
6279
6280                         svar = vstate->dtvs_locals[id];
6281                         ASSERT(svar != NULL);
6282                         v = &svar->dtsv_var;
6283
6284                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6285                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6286                                 size_t sz = v->dtdv_type.dtdt_size;
6287
6288                                 sz += sizeof (uint64_t);
6289                                 ASSERT(svar->dtsv_size == NCPU * sz);
6290                                 a += curcpu * sz;
6291
6292                                 if (*(uint8_t *)a == UINT8_MAX) {
6293                                         /*
6294                                          * If the 0th byte is set to UINT8_MAX
6295                                          * then this is to be treated as a
6296                                          * reference to a NULL variable.
6297                                          */
6298                                         regs[rd] = 0;
6299                                 } else {
6300                                         regs[rd] = a + sizeof (uint64_t);
6301                                 }
6302
6303                                 break;
6304                         }
6305
6306                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6307                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6308                         regs[rd] = tmp[curcpu];
6309                         break;
6310
6311                 case DIF_OP_STLS:
6312                         id = DIF_INSTR_VAR(instr);
6313
6314                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6315                         id -= DIF_VAR_OTHER_UBASE;
6316                         ASSERT(id < vstate->dtvs_nlocals);
6317
6318                         ASSERT(vstate->dtvs_locals != NULL);
6319                         svar = vstate->dtvs_locals[id];
6320                         ASSERT(svar != NULL);
6321                         v = &svar->dtsv_var;
6322
6323                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6324                                 uintptr_t a = (uintptr_t)svar->dtsv_data;
6325                                 size_t sz = v->dtdv_type.dtdt_size;
6326
6327                                 sz += sizeof (uint64_t);
6328                                 ASSERT(svar->dtsv_size == NCPU * sz);
6329                                 a += curcpu * sz;
6330
6331                                 if (regs[rd] == 0) {
6332                                         *(uint8_t *)a = UINT8_MAX;
6333                                         break;
6334                                 } else {
6335                                         *(uint8_t *)a = 0;
6336                                         a += sizeof (uint64_t);
6337                                 }
6338
6339                                 if (!dtrace_vcanload(
6340                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6341                                     mstate, vstate))
6342                                         break;
6343
6344                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6345                                     (void *)a, &v->dtdv_type);
6346                                 break;
6347                         }
6348
6349                         ASSERT(svar->dtsv_size == NCPU * sizeof (uint64_t));
6350                         tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6351                         tmp[curcpu] = regs[rd];
6352                         break;
6353
6354                 case DIF_OP_LDTS: {
6355                         dtrace_dynvar_t *dvar;
6356                         dtrace_key_t *key;
6357
6358                         id = DIF_INSTR_VAR(instr);
6359                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6360                         id -= DIF_VAR_OTHER_UBASE;
6361                         v = &vstate->dtvs_tlocals[id];
6362
6363                         key = &tupregs[DIF_DTR_NREGS];
6364                         key[0].dttk_value = (uint64_t)id;
6365                         key[0].dttk_size = 0;
6366                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6367                         key[1].dttk_size = 0;
6368
6369                         dvar = dtrace_dynvar(dstate, 2, key,
6370                             sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6371                             mstate, vstate);
6372
6373                         if (dvar == NULL) {
6374                                 regs[rd] = 0;
6375                                 break;
6376                         }
6377
6378                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6379                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6380                         } else {
6381                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6382                         }
6383
6384                         break;
6385                 }
6386
6387                 case DIF_OP_STTS: {
6388                         dtrace_dynvar_t *dvar;
6389                         dtrace_key_t *key;
6390
6391                         id = DIF_INSTR_VAR(instr);
6392                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6393                         id -= DIF_VAR_OTHER_UBASE;
6394
6395                         key = &tupregs[DIF_DTR_NREGS];
6396                         key[0].dttk_value = (uint64_t)id;
6397                         key[0].dttk_size = 0;
6398                         DTRACE_TLS_THRKEY(key[1].dttk_value);
6399                         key[1].dttk_size = 0;
6400                         v = &vstate->dtvs_tlocals[id];
6401
6402                         dvar = dtrace_dynvar(dstate, 2, key,
6403                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6404                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6405                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6406                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6407
6408                         /*
6409                          * Given that we're storing to thread-local data,
6410                          * we need to flush our predicate cache.
6411                          */
6412                         curthread->t_predcache = 0;
6413
6414                         if (dvar == NULL)
6415                                 break;
6416
6417                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6418                                 if (!dtrace_vcanload(
6419                                     (void *)(uintptr_t)regs[rd],
6420                                     &v->dtdv_type, mstate, vstate))
6421                                         break;
6422
6423                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6424                                     dvar->dtdv_data, &v->dtdv_type);
6425                         } else {
6426                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6427                         }
6428
6429                         break;
6430                 }
6431
6432                 case DIF_OP_SRA:
6433                         regs[rd] = (int64_t)regs[r1] >> regs[r2];
6434                         break;
6435
6436                 case DIF_OP_CALL:
6437                         dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6438                             regs, tupregs, ttop, mstate, state);
6439                         break;
6440
6441                 case DIF_OP_PUSHTR:
6442                         if (ttop == DIF_DTR_NREGS) {
6443                                 *flags |= CPU_DTRACE_TUPOFLOW;
6444                                 break;
6445                         }
6446
6447                         if (r1 == DIF_TYPE_STRING) {
6448                                 /*
6449                                  * If this is a string type and the size is 0,
6450                                  * we'll use the system-wide default string
6451                                  * size.  Note that we are _not_ looking at
6452                                  * the value of the DTRACEOPT_STRSIZE option;
6453                                  * had this been set, we would expect to have
6454                                  * a non-zero size value in the "pushtr".
6455                                  */
6456                                 tupregs[ttop].dttk_size =
6457                                     dtrace_strlen((char *)(uintptr_t)regs[rd],
6458                                     regs[r2] ? regs[r2] :
6459                                     dtrace_strsize_default) + 1;
6460                         } else {
6461                                 tupregs[ttop].dttk_size = regs[r2];
6462                         }
6463
6464                         tupregs[ttop++].dttk_value = regs[rd];
6465                         break;
6466
6467                 case DIF_OP_PUSHTV:
6468                         if (ttop == DIF_DTR_NREGS) {
6469                                 *flags |= CPU_DTRACE_TUPOFLOW;
6470                                 break;
6471                         }
6472
6473                         tupregs[ttop].dttk_value = regs[rd];
6474                         tupregs[ttop++].dttk_size = 0;
6475                         break;
6476
6477                 case DIF_OP_POPTS:
6478                         if (ttop != 0)
6479                                 ttop--;
6480                         break;
6481
6482                 case DIF_OP_FLUSHTS:
6483                         ttop = 0;
6484                         break;
6485
6486                 case DIF_OP_LDGAA:
6487                 case DIF_OP_LDTAA: {
6488                         dtrace_dynvar_t *dvar;
6489                         dtrace_key_t *key = tupregs;
6490                         uint_t nkeys = ttop;
6491
6492                         id = DIF_INSTR_VAR(instr);
6493                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6494                         id -= DIF_VAR_OTHER_UBASE;
6495
6496                         key[nkeys].dttk_value = (uint64_t)id;
6497                         key[nkeys++].dttk_size = 0;
6498
6499                         if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6500                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6501                                 key[nkeys++].dttk_size = 0;
6502                                 v = &vstate->dtvs_tlocals[id];
6503                         } else {
6504                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6505                         }
6506
6507                         dvar = dtrace_dynvar(dstate, nkeys, key,
6508                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6509                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6510                             DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6511
6512                         if (dvar == NULL) {
6513                                 regs[rd] = 0;
6514                                 break;
6515                         }
6516
6517                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6518                                 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6519                         } else {
6520                                 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6521                         }
6522
6523                         break;
6524                 }
6525
6526                 case DIF_OP_STGAA:
6527                 case DIF_OP_STTAA: {
6528                         dtrace_dynvar_t *dvar;
6529                         dtrace_key_t *key = tupregs;
6530                         uint_t nkeys = ttop;
6531
6532                         id = DIF_INSTR_VAR(instr);
6533                         ASSERT(id >= DIF_VAR_OTHER_UBASE);
6534                         id -= DIF_VAR_OTHER_UBASE;
6535
6536                         key[nkeys].dttk_value = (uint64_t)id;
6537                         key[nkeys++].dttk_size = 0;
6538
6539                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6540                                 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6541                                 key[nkeys++].dttk_size = 0;
6542                                 v = &vstate->dtvs_tlocals[id];
6543                         } else {
6544                                 v = &vstate->dtvs_globals[id]->dtsv_var;
6545                         }
6546
6547                         dvar = dtrace_dynvar(dstate, nkeys, key,
6548                             v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6549                             v->dtdv_type.dtdt_size : sizeof (uint64_t),
6550                             regs[rd] ? DTRACE_DYNVAR_ALLOC :
6551                             DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6552
6553                         if (dvar == NULL)
6554                                 break;
6555
6556                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6557                                 if (!dtrace_vcanload(
6558                                     (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6559                                     mstate, vstate))
6560                                         break;
6561
6562                                 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6563                                     dvar->dtdv_data, &v->dtdv_type);
6564                         } else {
6565                                 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6566                         }
6567
6568                         break;
6569                 }
6570
6571                 case DIF_OP_ALLOCS: {
6572                         uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6573                         size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6574
6575                         /*
6576                          * Rounding up the user allocation size could have
6577                          * overflowed large, bogus allocations (like -1ULL) to
6578                          * 0.
6579                          */
6580                         if (size < regs[r1] ||
6581                             !DTRACE_INSCRATCH(mstate, size)) {
6582                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6583                                 regs[rd] = 0;
6584                                 break;
6585                         }
6586
6587                         dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6588                         mstate->dtms_scratch_ptr += size;
6589                         regs[rd] = ptr;
6590                         break;
6591                 }
6592
6593                 case DIF_OP_COPYS:
6594                         if (!dtrace_canstore(regs[rd], regs[r2],
6595                             mstate, vstate)) {
6596                                 *flags |= CPU_DTRACE_BADADDR;
6597                                 *illval = regs[rd];
6598                                 break;
6599                         }
6600
6601                         if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6602                                 break;
6603
6604                         dtrace_bcopy((void *)(uintptr_t)regs[r1],
6605                             (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6606                         break;
6607
6608                 case DIF_OP_STB:
6609                         if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6610                                 *flags |= CPU_DTRACE_BADADDR;
6611                                 *illval = regs[rd];
6612                                 break;
6613                         }
6614                         *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6615                         break;
6616
6617                 case DIF_OP_STH:
6618                         if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6619                                 *flags |= CPU_DTRACE_BADADDR;
6620                                 *illval = regs[rd];
6621                                 break;
6622                         }
6623                         if (regs[rd] & 1) {
6624                                 *flags |= CPU_DTRACE_BADALIGN;
6625                                 *illval = regs[rd];
6626                                 break;
6627                         }
6628                         *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6629                         break;
6630
6631                 case DIF_OP_STW:
6632                         if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6633                                 *flags |= CPU_DTRACE_BADADDR;
6634                                 *illval = regs[rd];
6635                                 break;
6636                         }
6637                         if (regs[rd] & 3) {
6638                                 *flags |= CPU_DTRACE_BADALIGN;
6639                                 *illval = regs[rd];
6640                                 break;
6641                         }
6642                         *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6643                         break;
6644
6645                 case DIF_OP_STX:
6646                         if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6647                                 *flags |= CPU_DTRACE_BADADDR;
6648                                 *illval = regs[rd];
6649                                 break;
6650                         }
6651                         if (regs[rd] & 7) {
6652                                 *flags |= CPU_DTRACE_BADALIGN;
6653                                 *illval = regs[rd];
6654                                 break;
6655                         }
6656                         *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6657                         break;
6658                 }
6659         }
6660
6661         if (!(*flags & CPU_DTRACE_FAULT))
6662                 return (rval);
6663
6664         mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6665         mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6666
6667         return (0);
6668 }
6669
6670 static void
6671 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6672 {
6673         dtrace_probe_t *probe = ecb->dte_probe;
6674         dtrace_provider_t *prov = probe->dtpr_provider;
6675         char c[DTRACE_FULLNAMELEN + 80], *str;
6676         char *msg = "dtrace: breakpoint action at probe ";
6677         char *ecbmsg = " (ecb ";
6678         uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6679         uintptr_t val = (uintptr_t)ecb;
6680         int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6681
6682         if (dtrace_destructive_disallow)
6683                 return;
6684
6685         /*
6686          * It's impossible to be taking action on the NULL probe.
6687          */
6688         ASSERT(probe != NULL);
6689
6690         /*
6691          * This is a poor man's (destitute man's?) sprintf():  we want to
6692          * print the provider name, module name, function name and name of
6693          * the probe, along with the hex address of the ECB with the breakpoint
6694          * action -- all of which we must place in the character buffer by
6695          * hand.
6696          */
6697         while (*msg != '\0')
6698                 c[i++] = *msg++;
6699
6700         for (str = prov->dtpv_name; *str != '\0'; str++)
6701                 c[i++] = *str;
6702         c[i++] = ':';
6703
6704         for (str = probe->dtpr_mod; *str != '\0'; str++)
6705                 c[i++] = *str;
6706         c[i++] = ':';
6707
6708         for (str = probe->dtpr_func; *str != '\0'; str++)
6709                 c[i++] = *str;
6710         c[i++] = ':';
6711
6712         for (str = probe->dtpr_name; *str != '\0'; str++)
6713                 c[i++] = *str;
6714
6715         while (*ecbmsg != '\0')
6716                 c[i++] = *ecbmsg++;
6717
6718         while (shift >= 0) {
6719                 mask = (uintptr_t)0xf << shift;
6720
6721                 if (val >= ((uintptr_t)1 << shift))
6722                         c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6723                 shift -= 4;
6724         }
6725
6726         c[i++] = ')';
6727         c[i] = '\0';
6728
6729 #if defined(sun)
6730         debug_enter(c);
6731 #else
6732         kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
6733 #endif
6734 }
6735
6736 static void
6737 dtrace_action_panic(dtrace_ecb_t *ecb)
6738 {
6739         dtrace_probe_t *probe = ecb->dte_probe;
6740
6741         /*
6742          * It's impossible to be taking action on the NULL probe.
6743          */
6744         ASSERT(probe != NULL);
6745
6746         if (dtrace_destructive_disallow)
6747                 return;
6748
6749         if (dtrace_panicked != NULL)
6750                 return;
6751
6752         if (dtrace_casptr(&dtrace_panicked, NULL, curthread) != NULL)
6753                 return;
6754
6755         /*
6756          * We won the right to panic.  (We want to be sure that only one
6757          * thread calls panic() from dtrace_probe(), and that panic() is
6758          * called exactly once.)
6759          */
6760         dtrace_panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6761             probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6762             probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6763 }
6764
6765 static void
6766 dtrace_action_raise(uint64_t sig)
6767 {
6768         if (dtrace_destructive_disallow)
6769                 return;
6770
6771         if (sig >= NSIG) {
6772                 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6773                 return;
6774         }
6775
6776 #if defined(sun)
6777         /*
6778          * raise() has a queue depth of 1 -- we ignore all subsequent
6779          * invocations of the raise() action.
6780          */
6781         if (curthread->t_dtrace_sig == 0)
6782                 curthread->t_dtrace_sig = (uint8_t)sig;
6783
6784         curthread->t_sig_check = 1;
6785         aston(curthread);
6786 #else
6787         struct proc *p = curproc;
6788         PROC_LOCK(p);
6789         kern_psignal(p, sig);
6790         PROC_UNLOCK(p);
6791 #endif
6792 }
6793
6794 static void
6795 dtrace_action_stop(void)
6796 {
6797         if (dtrace_destructive_disallow)
6798                 return;
6799
6800 #if defined(sun)
6801         if (!curthread->t_dtrace_stop) {
6802                 curthread->t_dtrace_stop = 1;
6803                 curthread->t_sig_check = 1;
6804                 aston(curthread);
6805         }
6806 #else
6807         struct proc *p = curproc;
6808         PROC_LOCK(p);
6809         kern_psignal(p, SIGSTOP);
6810         PROC_UNLOCK(p);
6811 #endif
6812 }
6813
6814 static void
6815 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6816 {
6817         hrtime_t now;
6818         volatile uint16_t *flags;
6819 #if defined(sun)
6820         cpu_t *cpu = CPU;
6821 #else
6822         cpu_t *cpu = &solaris_cpu[curcpu];
6823 #endif
6824
6825         if (dtrace_destructive_disallow)
6826                 return;
6827
6828         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6829
6830         now = dtrace_gethrtime();
6831
6832         if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6833                 /*
6834                  * We need to advance the mark to the current time.
6835                  */
6836                 cpu->cpu_dtrace_chillmark = now;
6837                 cpu->cpu_dtrace_chilled = 0;
6838         }
6839
6840         /*
6841          * Now check to see if the requested chill time would take us over
6842          * the maximum amount of time allowed in the chill interval.  (Or
6843          * worse, if the calculation itself induces overflow.)
6844          */
6845         if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6846             cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6847                 *flags |= CPU_DTRACE_ILLOP;
6848                 return;
6849         }
6850
6851         while (dtrace_gethrtime() - now < val)
6852                 continue;
6853
6854         /*
6855          * Normally, we assure that the value of the variable "timestamp" does
6856          * not change within an ECB.  The presence of chill() represents an
6857          * exception to this rule, however.
6858          */
6859         mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6860         cpu->cpu_dtrace_chilled += val;
6861 }
6862
6863 static void
6864 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6865     uint64_t *buf, uint64_t arg)
6866 {
6867         int nframes = DTRACE_USTACK_NFRAMES(arg);
6868         int strsize = DTRACE_USTACK_STRSIZE(arg);
6869         uint64_t *pcs = &buf[1], *fps;
6870         char *str = (char *)&pcs[nframes];
6871         int size, offs = 0, i, j;
6872         uintptr_t old = mstate->dtms_scratch_ptr, saved;
6873         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
6874         char *sym;
6875
6876         /*
6877          * Should be taking a faster path if string space has not been
6878          * allocated.
6879          */
6880         ASSERT(strsize != 0);
6881
6882         /*
6883          * We will first allocate some temporary space for the frame pointers.
6884          */
6885         fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6886         size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6887             (nframes * sizeof (uint64_t));
6888
6889         if (!DTRACE_INSCRATCH(mstate, size)) {
6890                 /*
6891                  * Not enough room for our frame pointers -- need to indicate
6892                  * that we ran out of scratch space.
6893                  */
6894                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6895                 return;
6896         }
6897
6898         mstate->dtms_scratch_ptr += size;
6899         saved = mstate->dtms_scratch_ptr;
6900
6901         /*
6902          * Now get a stack with both program counters and frame pointers.
6903          */
6904         DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6905         dtrace_getufpstack(buf, fps, nframes + 1);
6906         DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6907
6908         /*
6909          * If that faulted, we're cooked.
6910          */
6911         if (*flags & CPU_DTRACE_FAULT)
6912                 goto out;
6913
6914         /*
6915          * Now we want to walk up the stack, calling the USTACK helper.  For
6916          * each iteration, we restore the scratch pointer.
6917          */
6918         for (i = 0; i < nframes; i++) {
6919                 mstate->dtms_scratch_ptr = saved;
6920
6921                 if (offs >= strsize)
6922                         break;
6923
6924                 sym = (char *)(uintptr_t)dtrace_helper(
6925                     DTRACE_HELPER_ACTION_USTACK,
6926                     mstate, state, pcs[i], fps[i]);
6927
6928                 /*
6929                  * If we faulted while running the helper, we're going to
6930                  * clear the fault and null out the corresponding string.
6931                  */
6932                 if (*flags & CPU_DTRACE_FAULT) {
6933                         *flags &= ~CPU_DTRACE_FAULT;
6934                         str[offs++] = '\0';
6935                         continue;
6936                 }
6937
6938                 if (sym == NULL) {
6939                         str[offs++] = '\0';
6940                         continue;
6941                 }
6942
6943                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6944
6945                 /*
6946                  * Now copy in the string that the helper returned to us.
6947                  */
6948                 for (j = 0; offs + j < strsize; j++) {
6949                         if ((str[offs + j] = sym[j]) == '\0')
6950                                 break;
6951                 }
6952
6953                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6954
6955                 offs += j + 1;
6956         }
6957
6958         if (offs >= strsize) {
6959                 /*
6960                  * If we didn't have room for all of the strings, we don't
6961                  * abort processing -- this needn't be a fatal error -- but we
6962                  * still want to increment a counter (dts_stkstroverflows) to
6963                  * allow this condition to be warned about.  (If this is from
6964                  * a jstack() action, it is easily tuned via jstackstrsize.)
6965                  */
6966                 dtrace_error(&state->dts_stkstroverflows);
6967         }
6968
6969         while (offs < strsize)
6970                 str[offs++] = '\0';
6971
6972 out:
6973         mstate->dtms_scratch_ptr = old;
6974 }
6975
6976 static void
6977 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6978     size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6979 {
6980         volatile uint16_t *flags;
6981         uint64_t val = *valp;
6982         size_t valoffs = *valoffsp;
6983
6984         flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
6985         ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6986
6987         /*
6988          * If this is a string, we're going to only load until we find the zero
6989          * byte -- after which we'll store zero bytes.
6990          */
6991         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6992                 char c = '\0' + 1;
6993                 size_t s;
6994
6995                 for (s = 0; s < size; s++) {
6996                         if (c != '\0' && dtkind == DIF_TF_BYREF) {
6997                                 c = dtrace_load8(val++);
6998                         } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6999                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7000                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
7001                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7002                                 if (*flags & CPU_DTRACE_FAULT)
7003                                         break;
7004                         }
7005
7006                         DTRACE_STORE(uint8_t, tomax, valoffs++, c);
7007
7008                         if (c == '\0' && intuple)
7009                                 break;
7010                 }
7011         } else {
7012                 uint8_t c;
7013                 while (valoffs < end) {
7014                         if (dtkind == DIF_TF_BYREF) {
7015                                 c = dtrace_load8(val++);
7016                         } else if (dtkind == DIF_TF_BYUREF) {
7017                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7018                                 c = dtrace_fuword8((void *)(uintptr_t)val++);
7019                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7020                                 if (*flags & CPU_DTRACE_FAULT)
7021                                         break;
7022                         }
7023
7024                         DTRACE_STORE(uint8_t, tomax,
7025                             valoffs++, c);
7026                 }
7027         }
7028
7029         *valp = val;
7030         *valoffsp = valoffs;
7031 }
7032
7033 /*
7034  * If you're looking for the epicenter of DTrace, you just found it.  This
7035  * is the function called by the provider to fire a probe -- from which all
7036  * subsequent probe-context DTrace activity emanates.
7037  */
7038 void
7039 dtrace_probe(dtrace_id_t id, uintptr_t arg0, uintptr_t arg1,
7040     uintptr_t arg2, uintptr_t arg3, uintptr_t arg4)
7041 {
7042         processorid_t cpuid;
7043         dtrace_icookie_t cookie;
7044         dtrace_probe_t *probe;
7045         dtrace_mstate_t mstate;
7046         dtrace_ecb_t *ecb;
7047         dtrace_action_t *act;
7048         intptr_t offs;
7049         size_t size;
7050         int vtime, onintr;
7051         volatile uint16_t *flags;
7052         hrtime_t now;
7053
7054         if (panicstr != NULL)
7055                 return;
7056
7057 #if defined(sun)
7058         /*
7059          * Kick out immediately if this CPU is still being born (in which case
7060          * curthread will be set to -1) or the current thread can't allow
7061          * probes in its current context.
7062          */
7063         if (((uintptr_t)curthread & 1) || (curthread->t_flag & T_DONTDTRACE))
7064                 return;
7065 #endif
7066
7067         cookie = dtrace_interrupt_disable();
7068         probe = dtrace_probes[id - 1];
7069         cpuid = curcpu;
7070         onintr = CPU_ON_INTR(CPU);
7071
7072         if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7073             probe->dtpr_predcache == curthread->t_predcache) {
7074                 /*
7075                  * We have hit in the predicate cache; we know that
7076                  * this predicate would evaluate to be false.
7077                  */
7078                 dtrace_interrupt_enable(cookie);
7079                 return;
7080         }
7081
7082 #if defined(sun)
7083         if (panic_quiesce) {
7084 #else
7085         if (panicstr != NULL) {
7086 #endif
7087                 /*
7088                  * We don't trace anything if we're panicking.
7089                  */
7090                 dtrace_interrupt_enable(cookie);
7091                 return;
7092         }
7093
7094         now = mstate.dtms_timestamp = dtrace_gethrtime();
7095         mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7096         vtime = dtrace_vtime_references != 0;
7097
7098         if (vtime && curthread->t_dtrace_start)
7099                 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7100
7101         mstate.dtms_difo = NULL;
7102         mstate.dtms_probe = probe;
7103         mstate.dtms_strtok = 0;
7104         mstate.dtms_arg[0] = arg0;
7105         mstate.dtms_arg[1] = arg1;
7106         mstate.dtms_arg[2] = arg2;
7107         mstate.dtms_arg[3] = arg3;
7108         mstate.dtms_arg[4] = arg4;
7109
7110         flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7111
7112         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7113                 dtrace_predicate_t *pred = ecb->dte_predicate;
7114                 dtrace_state_t *state = ecb->dte_state;
7115                 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7116                 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7117                 dtrace_vstate_t *vstate = &state->dts_vstate;
7118                 dtrace_provider_t *prov = probe->dtpr_provider;
7119                 uint64_t tracememsize = 0;
7120                 int committed = 0;
7121                 caddr_t tomax;
7122
7123                 /*
7124                  * A little subtlety with the following (seemingly innocuous)
7125                  * declaration of the automatic 'val':  by looking at the
7126                  * code, you might think that it could be declared in the
7127                  * action processing loop, below.  (That is, it's only used in
7128                  * the action processing loop.)  However, it must be declared
7129                  * out of that scope because in the case of DIF expression
7130                  * arguments to aggregating actions, one iteration of the
7131                  * action loop will use the last iteration's value.
7132                  */
7133                 uint64_t val = 0;
7134
7135                 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7136                 mstate.dtms_getf = NULL;
7137
7138                 *flags &= ~CPU_DTRACE_ERROR;
7139
7140                 if (prov == dtrace_provider) {
7141                         /*
7142                          * If dtrace itself is the provider of this probe,
7143                          * we're only going to continue processing the ECB if
7144                          * arg0 (the dtrace_state_t) is equal to the ECB's
7145                          * creating state.  (This prevents disjoint consumers
7146                          * from seeing one another's metaprobes.)
7147                          */
7148                         if (arg0 != (uint64_t)(uintptr_t)state)
7149                                 continue;
7150                 }
7151
7152                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7153                         /*
7154                          * We're not currently active.  If our provider isn't
7155                          * the dtrace pseudo provider, we're not interested.
7156                          */
7157                         if (prov != dtrace_provider)
7158                                 continue;
7159
7160                         /*
7161                          * Now we must further check if we are in the BEGIN
7162                          * probe.  If we are, we will only continue processing
7163                          * if we're still in WARMUP -- if one BEGIN enabling
7164                          * has invoked the exit() action, we don't want to
7165                          * evaluate subsequent BEGIN enablings.
7166                          */
7167                         if (probe->dtpr_id == dtrace_probeid_begin &&
7168                             state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7169                                 ASSERT(state->dts_activity ==
7170                                     DTRACE_ACTIVITY_DRAINING);
7171                                 continue;
7172                         }
7173                 }
7174
7175                 if (ecb->dte_cond) {
7176                         /*
7177                          * If the dte_cond bits indicate that this
7178                          * consumer is only allowed to see user-mode firings
7179                          * of this probe, call the provider's dtps_usermode()
7180                          * entry point to check that the probe was fired
7181                          * while in a user context. Skip this ECB if that's
7182                          * not the case.
7183                          */
7184                         if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7185                             prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7186                             probe->dtpr_id, probe->dtpr_arg) == 0)
7187                                 continue;
7188
7189 #if defined(sun)
7190                         /*
7191                          * This is more subtle than it looks. We have to be
7192                          * absolutely certain that CRED() isn't going to
7193                          * change out from under us so it's only legit to
7194                          * examine that structure if we're in constrained
7195                          * situations. Currently, the only time we'll perform
7196                          * this check is if a non-super-user has enabled the
7197                          * profile or syscall providers -- providers that
7198                          * allow visibility of all processes. For the
7199                          * profile case, the check above will ensure that
7200                          * we're examining a user context.
7201                          */
7202                         if (ecb->dte_cond & DTRACE_COND_OWNER) {
7203                                 cred_t *cr;
7204                                 cred_t *s_cr =
7205                                     ecb->dte_state->dts_cred.dcr_cred;
7206                                 proc_t *proc;
7207
7208                                 ASSERT(s_cr != NULL);
7209
7210                                 if ((cr = CRED()) == NULL ||
7211                                     s_cr->cr_uid != cr->cr_uid ||
7212                                     s_cr->cr_uid != cr->cr_ruid ||
7213                                     s_cr->cr_uid != cr->cr_suid ||
7214                                     s_cr->cr_gid != cr->cr_gid ||
7215                                     s_cr->cr_gid != cr->cr_rgid ||
7216                                     s_cr->cr_gid != cr->cr_sgid ||
7217                                     (proc = ttoproc(curthread)) == NULL ||
7218                                     (proc->p_flag & SNOCD))
7219                                         continue;
7220                         }
7221
7222                         if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7223                                 cred_t *cr;
7224                                 cred_t *s_cr =
7225                                     ecb->dte_state->dts_cred.dcr_cred;
7226
7227                                 ASSERT(s_cr != NULL);
7228
7229                                 if ((cr = CRED()) == NULL ||
7230                                     s_cr->cr_zone->zone_id !=
7231                                     cr->cr_zone->zone_id)
7232                                         continue;
7233                         }
7234 #endif
7235                 }
7236
7237                 if (now - state->dts_alive > dtrace_deadman_timeout) {
7238                         /*
7239                          * We seem to be dead.  Unless we (a) have kernel
7240                          * destructive permissions (b) have explicitly enabled
7241                          * destructive actions and (c) destructive actions have
7242                          * not been disabled, we're going to transition into
7243                          * the KILLED state, from which no further processing
7244                          * on this state will be performed.
7245                          */
7246                         if (!dtrace_priv_kernel_destructive(state) ||
7247                             !state->dts_cred.dcr_destructive ||
7248                             dtrace_destructive_disallow) {
7249                                 void *activity = &state->dts_activity;
7250                                 dtrace_activity_t current;
7251
7252                                 do {
7253                                         current = state->dts_activity;
7254                                 } while (dtrace_cas32(activity, current,
7255                                     DTRACE_ACTIVITY_KILLED) != current);
7256
7257                                 continue;
7258                         }
7259                 }
7260
7261                 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7262                     ecb->dte_alignment, state, &mstate)) < 0)
7263                         continue;
7264
7265                 tomax = buf->dtb_tomax;
7266                 ASSERT(tomax != NULL);
7267
7268                 if (ecb->dte_size != 0) {
7269                         dtrace_rechdr_t dtrh;
7270                         if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7271                                 mstate.dtms_timestamp = dtrace_gethrtime();
7272                                 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7273                         }
7274                         ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
7275                         dtrh.dtrh_epid = ecb->dte_epid;
7276                         DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
7277                             mstate.dtms_timestamp);
7278                         *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
7279                 }
7280
7281                 mstate.dtms_epid = ecb->dte_epid;
7282                 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7283
7284                 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7285                         mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7286                 else
7287                         mstate.dtms_access = 0;
7288
7289                 if (pred != NULL) {
7290                         dtrace_difo_t *dp = pred->dtp_difo;
7291                         int rval;
7292
7293                         rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7294
7295                         if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7296                                 dtrace_cacheid_t cid = probe->dtpr_predcache;
7297
7298                                 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7299                                         /*
7300                                          * Update the predicate cache...
7301                                          */
7302                                         ASSERT(cid == pred->dtp_cacheid);
7303                                         curthread->t_predcache = cid;
7304                                 }
7305
7306                                 continue;
7307                         }
7308                 }
7309
7310                 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7311                     act != NULL; act = act->dta_next) {
7312                         size_t valoffs;
7313                         dtrace_difo_t *dp;
7314                         dtrace_recdesc_t *rec = &act->dta_rec;
7315
7316                         size = rec->dtrd_size;
7317                         valoffs = offs + rec->dtrd_offset;
7318
7319                         if (DTRACEACT_ISAGG(act->dta_kind)) {
7320                                 uint64_t v = 0xbad;
7321                                 dtrace_aggregation_t *agg;
7322
7323                                 agg = (dtrace_aggregation_t *)act;
7324
7325                                 if ((dp = act->dta_difo) != NULL)
7326                                         v = dtrace_dif_emulate(dp,
7327                                             &mstate, vstate, state);
7328
7329                                 if (*flags & CPU_DTRACE_ERROR)
7330                                         continue;
7331
7332                                 /*
7333                                  * Note that we always pass the expression
7334                                  * value from the previous iteration of the
7335                                  * action loop.  This value will only be used
7336                                  * if there is an expression argument to the
7337                                  * aggregating action, denoted by the
7338                                  * dtag_hasarg field.
7339                                  */
7340                                 dtrace_aggregate(agg, buf,
7341                                     offs, aggbuf, v, val);
7342                                 continue;
7343                         }
7344
7345                         switch (act->dta_kind) {
7346                         case DTRACEACT_STOP:
7347                                 if (dtrace_priv_proc_destructive(state))
7348                                         dtrace_action_stop();
7349                                 continue;
7350
7351                         case DTRACEACT_BREAKPOINT:
7352                                 if (dtrace_priv_kernel_destructive(state))
7353                                         dtrace_action_breakpoint(ecb);
7354                                 continue;
7355
7356                         case DTRACEACT_PANIC:
7357                                 if (dtrace_priv_kernel_destructive(state))
7358                                         dtrace_action_panic(ecb);
7359                                 continue;
7360
7361                         case DTRACEACT_STACK:
7362                                 if (!dtrace_priv_kernel(state))
7363                                         continue;
7364
7365                                 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7366                                     size / sizeof (pc_t), probe->dtpr_aframes,
7367                                     DTRACE_ANCHORED(probe) ? NULL :
7368                                     (uint32_t *)arg0);
7369                                 continue;
7370
7371                         case DTRACEACT_JSTACK:
7372                         case DTRACEACT_USTACK:
7373                                 if (!dtrace_priv_proc(state))
7374                                         continue;
7375
7376                                 /*
7377                                  * See comment in DIF_VAR_PID.
7378                                  */
7379                                 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7380                                     CPU_ON_INTR(CPU)) {
7381                                         int depth = DTRACE_USTACK_NFRAMES(
7382                                             rec->dtrd_arg) + 1;
7383
7384                                         dtrace_bzero((void *)(tomax + valoffs),
7385                                             DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7386                                             + depth * sizeof (uint64_t));
7387
7388                                         continue;
7389                                 }
7390
7391                                 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7392                                     curproc->p_dtrace_helpers != NULL) {
7393                                         /*
7394                                          * This is the slow path -- we have
7395                                          * allocated string space, and we're
7396                                          * getting the stack of a process that
7397                                          * has helpers.  Call into a separate
7398                                          * routine to perform this processing.
7399                                          */
7400                                         dtrace_action_ustack(&mstate, state,
7401                                             (uint64_t *)(tomax + valoffs),
7402                                             rec->dtrd_arg);
7403                                         continue;
7404                                 }
7405
7406                                 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7407                                 dtrace_getupcstack((uint64_t *)
7408                                     (tomax + valoffs),
7409                                     DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7410                                 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7411                                 continue;
7412
7413                         default:
7414                                 break;
7415                         }
7416
7417                         dp = act->dta_difo;
7418                         ASSERT(dp != NULL);
7419
7420                         val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7421
7422                         if (*flags & CPU_DTRACE_ERROR)
7423                                 continue;
7424
7425                         switch (act->dta_kind) {
7426                         case DTRACEACT_SPECULATE: {
7427                                 dtrace_rechdr_t *dtrh;
7428
7429                                 ASSERT(buf == &state->dts_buffer[cpuid]);
7430                                 buf = dtrace_speculation_buffer(state,
7431                                     cpuid, val);
7432
7433                                 if (buf == NULL) {
7434                                         *flags |= CPU_DTRACE_DROP;
7435                                         continue;
7436                                 }
7437
7438                                 offs = dtrace_buffer_reserve(buf,
7439                                     ecb->dte_needed, ecb->dte_alignment,
7440                                     state, NULL);
7441
7442                                 if (offs < 0) {
7443                                         *flags |= CPU_DTRACE_DROP;
7444                                         continue;
7445                                 }
7446
7447                                 tomax = buf->dtb_tomax;
7448                                 ASSERT(tomax != NULL);
7449
7450                                 if (ecb->dte_size == 0)
7451                                         continue;
7452
7453                                 ASSERT3U(ecb->dte_size, >=,
7454                                     sizeof (dtrace_rechdr_t));
7455                                 dtrh = ((void *)(tomax + offs));
7456                                 dtrh->dtrh_epid = ecb->dte_epid;
7457                                 /*
7458                                  * When the speculation is committed, all of
7459                                  * the records in the speculative buffer will
7460                                  * have their timestamps set to the commit
7461                                  * time.  Until then, it is set to a sentinel
7462                                  * value, for debugability.
7463                                  */
7464                                 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7465                                 continue;
7466                         }
7467
7468                         case DTRACEACT_PRINTM: {
7469                                 /* The DIF returns a 'memref'. */
7470                                 uintptr_t *memref = (uintptr_t *)(uintptr_t) val;
7471
7472                                 /* Get the size from the memref. */
7473                                 size = memref[1];
7474
7475                                 /*
7476                                  * Check if the size exceeds the allocated
7477                                  * buffer size.
7478                                  */
7479                                 if (size + sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7480                                         /* Flag a drop! */
7481                                         *flags |= CPU_DTRACE_DROP;
7482                                         continue;
7483                                 }
7484
7485                                 /* Store the size in the buffer first. */
7486                                 DTRACE_STORE(uintptr_t, tomax,
7487                                     valoffs, size);
7488
7489                                 /*
7490                                  * Offset the buffer address to the start
7491                                  * of the data.
7492                                  */
7493                                 valoffs += sizeof(uintptr_t);
7494
7495                                 /*
7496                                  * Reset to the memory address rather than
7497                                  * the memref array, then let the BYREF
7498                                  * code below do the work to store the 
7499                                  * memory data in the buffer.
7500                                  */
7501                                 val = memref[0];
7502                                 break;
7503                         }
7504
7505                         case DTRACEACT_PRINTT: {
7506                                 /* The DIF returns a 'typeref'. */
7507                                 uintptr_t *typeref = (uintptr_t *)(uintptr_t) val;
7508                                 char c = '\0' + 1;
7509                                 size_t s;
7510
7511                                 /*
7512                                  * Get the type string length and round it
7513                                  * up so that the data that follows is
7514                                  * aligned for easy access.
7515                                  */
7516                                 size_t typs = strlen((char *) typeref[2]) + 1;
7517                                 typs = roundup(typs,  sizeof(uintptr_t));
7518
7519                                 /*
7520                                  *Get the size from the typeref using the
7521                                  * number of elements and the type size.
7522                                  */
7523                                 size = typeref[1] * typeref[3];
7524
7525                                 /*
7526                                  * Check if the size exceeds the allocated
7527                                  * buffer size.
7528                                  */
7529                                 if (size + typs + 2 * sizeof(uintptr_t) > dp->dtdo_rtype.dtdt_size) {
7530                                         /* Flag a drop! */
7531                                         *flags |= CPU_DTRACE_DROP;
7532                                 
7533                                 }
7534
7535                                 /* Store the size in the buffer first. */
7536                                 DTRACE_STORE(uintptr_t, tomax,
7537                                     valoffs, size);
7538                                 valoffs += sizeof(uintptr_t);
7539
7540                                 /* Store the type size in the buffer. */
7541                                 DTRACE_STORE(uintptr_t, tomax,
7542                                     valoffs, typeref[3]);
7543                                 valoffs += sizeof(uintptr_t);
7544
7545                                 val = typeref[2];
7546
7547                                 for (s = 0; s < typs; s++) {
7548                                         if (c != '\0')
7549                                                 c = dtrace_load8(val++);
7550
7551                                         DTRACE_STORE(uint8_t, tomax,
7552                                             valoffs++, c);
7553                                 }
7554
7555                                 /*
7556                                  * Reset to the memory address rather than
7557                                  * the typeref array, then let the BYREF
7558                                  * code below do the work to store the 
7559                                  * memory data in the buffer.
7560                                  */
7561                                 val = typeref[0];
7562                                 break;
7563                         }
7564
7565                         case DTRACEACT_CHILL:
7566                                 if (dtrace_priv_kernel_destructive(state))
7567                                         dtrace_action_chill(&mstate, val);
7568                                 continue;
7569
7570                         case DTRACEACT_RAISE:
7571                                 if (dtrace_priv_proc_destructive(state))
7572                                         dtrace_action_raise(val);
7573                                 continue;
7574
7575                         case DTRACEACT_COMMIT:
7576                                 ASSERT(!committed);
7577
7578                                 /*
7579                                  * We need to commit our buffer state.
7580                                  */
7581                                 if (ecb->dte_size)
7582                                         buf->dtb_offset = offs + ecb->dte_size;
7583                                 buf = &state->dts_buffer[cpuid];
7584                                 dtrace_speculation_commit(state, cpuid, val);
7585                                 committed = 1;
7586                                 continue;
7587
7588                         case DTRACEACT_DISCARD:
7589                                 dtrace_speculation_discard(state, cpuid, val);
7590                                 continue;
7591
7592                         case DTRACEACT_DIFEXPR:
7593                         case DTRACEACT_LIBACT:
7594                         case DTRACEACT_PRINTF:
7595                         case DTRACEACT_PRINTA:
7596                         case DTRACEACT_SYSTEM:
7597                         case DTRACEACT_FREOPEN:
7598                         case DTRACEACT_TRACEMEM:
7599                                 break;
7600
7601                         case DTRACEACT_TRACEMEM_DYNSIZE:
7602                                 tracememsize = val;
7603                                 break;
7604
7605                         case DTRACEACT_SYM:
7606                         case DTRACEACT_MOD:
7607                                 if (!dtrace_priv_kernel(state))
7608                                         continue;
7609                                 break;
7610
7611                         case DTRACEACT_USYM:
7612                         case DTRACEACT_UMOD:
7613                         case DTRACEACT_UADDR: {
7614 #if defined(sun)
7615                                 struct pid *pid = curthread->t_procp->p_pidp;
7616 #endif
7617
7618                                 if (!dtrace_priv_proc(state))
7619                                         continue;
7620
7621                                 DTRACE_STORE(uint64_t, tomax,
7622 #if defined(sun)
7623                                     valoffs, (uint64_t)pid->pid_id);
7624 #else
7625                                     valoffs, (uint64_t) curproc->p_pid);
7626 #endif
7627                                 DTRACE_STORE(uint64_t, tomax,
7628                                     valoffs + sizeof (uint64_t), val);
7629
7630                                 continue;
7631                         }
7632
7633                         case DTRACEACT_EXIT: {
7634                                 /*
7635                                  * For the exit action, we are going to attempt
7636                                  * to atomically set our activity to be
7637                                  * draining.  If this fails (either because
7638                                  * another CPU has beat us to the exit action,
7639                                  * or because our current activity is something
7640                                  * other than ACTIVE or WARMUP), we will
7641                                  * continue.  This assures that the exit action
7642                                  * can be successfully recorded at most once
7643                                  * when we're in the ACTIVE state.  If we're
7644                                  * encountering the exit() action while in
7645                                  * COOLDOWN, however, we want to honor the new
7646                                  * status code.  (We know that we're the only
7647                                  * thread in COOLDOWN, so there is no race.)
7648                                  */
7649                                 void *activity = &state->dts_activity;
7650                                 dtrace_activity_t current = state->dts_activity;
7651
7652                                 if (current == DTRACE_ACTIVITY_COOLDOWN)
7653                                         break;
7654
7655                                 if (current != DTRACE_ACTIVITY_WARMUP)
7656                                         current = DTRACE_ACTIVITY_ACTIVE;
7657
7658                                 if (dtrace_cas32(activity, current,
7659                                     DTRACE_ACTIVITY_DRAINING) != current) {
7660                                         *flags |= CPU_DTRACE_DROP;
7661                                         continue;
7662                                 }
7663
7664                                 break;
7665                         }
7666
7667                         default:
7668                                 ASSERT(0);
7669                         }
7670
7671                         if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
7672                             dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
7673                                 uintptr_t end = valoffs + size;
7674
7675                                 if (tracememsize != 0 &&
7676                                     valoffs + tracememsize < end) {
7677                                         end = valoffs + tracememsize;
7678                                         tracememsize = 0;
7679                                 }
7680
7681                                 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7682                                     !dtrace_vcanload((void *)(uintptr_t)val,
7683                                     &dp->dtdo_rtype, &mstate, vstate))
7684                                         continue;
7685
7686                                 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7687                                     &val, end, act->dta_intuple,
7688                                     dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7689                                     DIF_TF_BYREF: DIF_TF_BYUREF);
7690                                 continue;
7691                         }
7692
7693                         switch (size) {
7694                         case 0:
7695                                 break;
7696
7697                         case sizeof (uint8_t):
7698                                 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7699                                 break;
7700                         case sizeof (uint16_t):
7701                                 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7702                                 break;
7703                         case sizeof (uint32_t):
7704                                 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7705                                 break;
7706                         case sizeof (uint64_t):
7707                                 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7708                                 break;
7709                         default:
7710                                 /*
7711                                  * Any other size should have been returned by
7712                                  * reference, not by value.
7713                                  */
7714                                 ASSERT(0);
7715                                 break;
7716                         }
7717                 }
7718
7719                 if (*flags & CPU_DTRACE_DROP)
7720                         continue;
7721
7722                 if (*flags & CPU_DTRACE_FAULT) {
7723                         int ndx;
7724                         dtrace_action_t *err;
7725
7726                         buf->dtb_errors++;
7727
7728                         if (probe->dtpr_id == dtrace_probeid_error) {
7729                                 /*
7730                                  * There's nothing we can do -- we had an
7731                                  * error on the error probe.  We bump an
7732                                  * error counter to at least indicate that
7733                                  * this condition happened.
7734                                  */
7735                                 dtrace_error(&state->dts_dblerrors);
7736                                 continue;
7737                         }
7738
7739                         if (vtime) {
7740                                 /*
7741                                  * Before recursing on dtrace_probe(), we
7742                                  * need to explicitly clear out our start
7743                                  * time to prevent it from being accumulated
7744                                  * into t_dtrace_vtime.
7745                                  */
7746                                 curthread->t_dtrace_start = 0;
7747                         }
7748
7749                         /*
7750                          * Iterate over the actions to figure out which action
7751                          * we were processing when we experienced the error.
7752                          * Note that act points _past_ the faulting action; if
7753                          * act is ecb->dte_action, the fault was in the
7754                          * predicate, if it's ecb->dte_action->dta_next it's
7755                          * in action #1, and so on.
7756                          */
7757                         for (err = ecb->dte_action, ndx = 0;
7758                             err != act; err = err->dta_next, ndx++)
7759                                 continue;
7760
7761                         dtrace_probe_error(state, ecb->dte_epid, ndx,
7762                             (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7763                             mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7764                             cpu_core[cpuid].cpuc_dtrace_illval);
7765
7766                         continue;
7767                 }
7768
7769                 if (!committed)
7770                         buf->dtb_offset = offs + ecb->dte_size;
7771         }
7772
7773         if (vtime)
7774                 curthread->t_dtrace_start = dtrace_gethrtime();
7775
7776         dtrace_interrupt_enable(cookie);
7777 }
7778
7779 /*
7780  * DTrace Probe Hashing Functions
7781  *
7782  * The functions in this section (and indeed, the functions in remaining
7783  * sections) are not _called_ from probe context.  (Any exceptions to this are
7784  * marked with a "Note:".)  Rather, they are called from elsewhere in the
7785  * DTrace framework to look-up probes in, add probes to and remove probes from
7786  * the DTrace probe hashes.  (Each probe is hashed by each element of the
7787  * probe tuple -- allowing for fast lookups, regardless of what was
7788  * specified.)
7789  */
7790 static uint_t
7791 dtrace_hash_str(const char *p)
7792 {
7793         unsigned int g;
7794         uint_t hval = 0;
7795
7796         while (*p) {
7797                 hval = (hval << 4) + *p++;
7798                 if ((g = (hval & 0xf0000000)) != 0)
7799                         hval ^= g >> 24;
7800                 hval &= ~g;
7801         }
7802         return (hval);
7803 }
7804
7805 static dtrace_hash_t *
7806 dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
7807 {
7808         dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7809
7810         hash->dth_stroffs = stroffs;
7811         hash->dth_nextoffs = nextoffs;
7812         hash->dth_prevoffs = prevoffs;
7813
7814         hash->dth_size = 1;
7815         hash->dth_mask = hash->dth_size - 1;
7816
7817         hash->dth_tab = kmem_zalloc(hash->dth_size *
7818             sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7819
7820         return (hash);
7821 }
7822
7823 static void
7824 dtrace_hash_destroy(dtrace_hash_t *hash)
7825 {
7826 #ifdef DEBUG
7827         int i;
7828
7829         for (i = 0; i < hash->dth_size; i++)
7830                 ASSERT(hash->dth_tab[i] == NULL);
7831 #endif
7832
7833         kmem_free(hash->dth_tab,
7834             hash->dth_size * sizeof (dtrace_hashbucket_t *));
7835         kmem_free(hash, sizeof (dtrace_hash_t));
7836 }
7837
7838 static void
7839 dtrace_hash_resize(dtrace_hash_t *hash)
7840 {
7841         int size = hash->dth_size, i, ndx;
7842         int new_size = hash->dth_size << 1;
7843         int new_mask = new_size - 1;
7844         dtrace_hashbucket_t **new_tab, *bucket, *next;
7845
7846         ASSERT((new_size & new_mask) == 0);
7847
7848         new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7849
7850         for (i = 0; i < size; i++) {
7851                 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7852                         dtrace_probe_t *probe = bucket->dthb_chain;
7853
7854                         ASSERT(probe != NULL);
7855                         ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
7856
7857                         next = bucket->dthb_next;
7858                         bucket->dthb_next = new_tab[ndx];
7859                         new_tab[ndx] = bucket;
7860                 }
7861         }
7862
7863         kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7864         hash->dth_tab = new_tab;
7865         hash->dth_size = new_size;
7866         hash->dth_mask = new_mask;
7867 }
7868
7869 static void
7870 dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
7871 {
7872         int hashval = DTRACE_HASHSTR(hash, new);
7873         int ndx = hashval & hash->dth_mask;
7874         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7875         dtrace_probe_t **nextp, **prevp;
7876
7877         for (; bucket != NULL; bucket = bucket->dthb_next) {
7878                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7879                         goto add;
7880         }
7881
7882         if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7883                 dtrace_hash_resize(hash);
7884                 dtrace_hash_add(hash, new);
7885                 return;
7886         }
7887
7888         bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7889         bucket->dthb_next = hash->dth_tab[ndx];
7890         hash->dth_tab[ndx] = bucket;
7891         hash->dth_nbuckets++;
7892
7893 add:
7894         nextp = DTRACE_HASHNEXT(hash, new);
7895         ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7896         *nextp = bucket->dthb_chain;
7897
7898         if (bucket->dthb_chain != NULL) {
7899                 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7900                 ASSERT(*prevp == NULL);
7901                 *prevp = new;
7902         }
7903
7904         bucket->dthb_chain = new;
7905         bucket->dthb_len++;
7906 }
7907
7908 static dtrace_probe_t *
7909 dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
7910 {
7911         int hashval = DTRACE_HASHSTR(hash, template);
7912         int ndx = hashval & hash->dth_mask;
7913         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7914
7915         for (; bucket != NULL; bucket = bucket->dthb_next) {
7916                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7917                         return (bucket->dthb_chain);
7918         }
7919
7920         return (NULL);
7921 }
7922
7923 static int
7924 dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
7925 {
7926         int hashval = DTRACE_HASHSTR(hash, template);
7927         int ndx = hashval & hash->dth_mask;
7928         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7929
7930         for (; bucket != NULL; bucket = bucket->dthb_next) {
7931                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7932                         return (bucket->dthb_len);
7933         }
7934
7935         return (0);
7936 }
7937
7938 static void
7939 dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
7940 {
7941         int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
7942         dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7943
7944         dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
7945         dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
7946
7947         /*
7948          * Find the bucket that we're removing this probe from.
7949          */
7950         for (; bucket != NULL; bucket = bucket->dthb_next) {
7951                 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
7952                         break;
7953         }
7954
7955         ASSERT(bucket != NULL);
7956
7957         if (*prevp == NULL) {
7958                 if (*nextp == NULL) {
7959                         /*
7960                          * The removed probe was the only probe on this
7961                          * bucket; we need to remove the bucket.
7962                          */
7963                         dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7964
7965                         ASSERT(bucket->dthb_chain == probe);
7966                         ASSERT(b != NULL);
7967
7968                         if (b == bucket) {
7969                                 hash->dth_tab[ndx] = bucket->dthb_next;
7970                         } else {
7971                                 while (b->dthb_next != bucket)
7972                                         b = b->dthb_next;
7973                                 b->dthb_next = bucket->dthb_next;
7974                         }
7975
7976                         ASSERT(hash->dth_nbuckets > 0);
7977                         hash->dth_nbuckets--;
7978                         kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7979                         return;
7980                 }
7981
7982                 bucket->dthb_chain = *nextp;
7983         } else {
7984                 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7985         }
7986
7987         if (*nextp != NULL)
7988                 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7989 }
7990
7991 /*
7992  * DTrace Utility Functions
7993  *
7994  * These are random utility functions that are _not_ called from probe context.
7995  */
7996 static int
7997 dtrace_badattr(const dtrace_attribute_t *a)
7998 {
7999         return (a->dtat_name > DTRACE_STABILITY_MAX ||
8000             a->dtat_data > DTRACE_STABILITY_MAX ||
8001             a->dtat_class > DTRACE_CLASS_MAX);
8002 }
8003
8004 /*
8005  * Return a duplicate copy of a string.  If the specified string is NULL,
8006  * this function returns a zero-length string.
8007  */
8008 static char *
8009 dtrace_strdup(const char *str)
8010 {
8011         char *new = kmem_zalloc((str != NULL ? strlen(str) : 0) + 1, KM_SLEEP);
8012
8013         if (str != NULL)
8014                 (void) strcpy(new, str);
8015
8016         return (new);
8017 }
8018
/*
 * True for the ASCII letters 'a'-'z' and 'A'-'Z'.  Note that the argument is
 * evaluated more than once.
 */
#define DTRACE_ISALPHA(c)       \
        (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Check a name for illegal characters.  Returns 1 if the name is bad and 0
 * if it is acceptable.
 *
 * A NULL or empty name is acceptable.  Otherwise the first character must be
 * a letter, '-', '_' or '.'; every later character may additionally be a
 * digit or a backquote.
 */
static int
dtrace_badname(const char *s)
{
        const char *cp;

        if (s == NULL || *s == '\0')
                return (0);

        /*
         * The leading character has the stricter rule: no digits and no
         * backquote.
         */
        if (!DTRACE_ISALPHA(*s) && *s != '-' && *s != '_' && *s != '.')
                return (1);

        for (cp = s + 1; *cp != '\0'; cp++) {
                char c = *cp;

                if (DTRACE_ISALPHA(c) || (c >= '0' && c <= '9'))
                        continue;

                switch (c) {
                case '-':
                case '_':
                case '.':
                case '`':
                        continue;
                default:
                        return (1);
                }
        }

        return (0);
}
8041
8042 static void
8043 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8044 {
8045         uint32_t priv;
8046
8047 #if defined(sun)
8048         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8049                 /*
8050                  * For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
8051                  */
8052                 priv = DTRACE_PRIV_ALL;
8053         } else {
8054                 *uidp = crgetuid(cr);
8055                 *zoneidp = crgetzoneid(cr);
8056
8057                 priv = 0;
8058                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8059                         priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8060                 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8061                         priv |= DTRACE_PRIV_USER;
8062                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8063                         priv |= DTRACE_PRIV_PROC;
8064                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8065                         priv |= DTRACE_PRIV_OWNER;
8066                 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8067                         priv |= DTRACE_PRIV_ZONEOWNER;
8068         }
8069 #else
8070         priv = DTRACE_PRIV_ALL;
8071 #endif
8072
8073         *privp = priv;
8074 }
8075
#ifdef DTRACE_ERRDEBUG
/*
 * Record an occurrence of an error message in the dtrace_errhash table
 * (only built when DTRACE_ERRDEBUG is defined).
 *
 * Under dtrace_errlock, the message and current thread are noted in
 * dtrace_errlast / dtrace_errthread.  The table is then searched starting at
 * the slot chosen by hashing the string, stepping forward one slot at a time
 * (with wrap-around) past entries that hold a different message.  Messages
 * are compared by pointer, not by content.  A matching entry has its count
 * incremented; the first empty entry is claimed with a count of 1.  If every
 * slot has been examined without success, the system panics.
 */
static void
dtrace_errdebug(const char *str)
{
        int slot = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
        int tries;
        int recorded = 0;

        mutex_enter(&dtrace_errlock);
        dtrace_errlast = str;
        dtrace_errthread = curthread;

        for (tries = 0; tries < DTRACE_ERRHASHSZ; tries++) {
                if (dtrace_errhash[slot].dter_msg == str) {
                        /* Seen before: just bump the count. */
                        dtrace_errhash[slot].dter_count++;
                        recorded = 1;
                        break;
                }

                if (dtrace_errhash[slot].dter_msg == NULL) {
                        /* Free slot: claim it for this message. */
                        dtrace_errhash[slot].dter_msg = str;
                        dtrace_errhash[slot].dter_count = 1;
                        recorded = 1;
                        break;
                }

                /* Occupied by a different message; try the next slot. */
                slot = (slot + 1) % DTRACE_ERRHASHSZ;
        }

        if (!recorded)
                panic("dtrace: undersized error hash");

        mutex_exit(&dtrace_errlock);
}
#endif
8108
8109 /*
8110  * DTrace Matching Functions
8111  *
8112  * These functions are used to match groups of probes, given some elements of
8113  * a probe tuple, or some globbed expressions for elements of a probe tuple.
8114  */
8115 static int
8116 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8117     zoneid_t zoneid)
8118 {
8119         if (priv != DTRACE_PRIV_ALL) {
8120                 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8121                 uint32_t match = priv & ppriv;
8122
8123                 /*
8124                  * No PRIV_DTRACE_* privileges...
8125                  */
8126                 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8127                     DTRACE_PRIV_KERNEL)) == 0)
8128                         return (0);
8129
8130                 /*
8131                  * No matching bits, but there were bits to match...
8132                  */
8133                 if (match == 0 && ppriv != 0)
8134                         return (0);
8135
8136                 /*
8137                  * Need to have permissions to the process, but don't...
8138                  */
8139                 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8140                     uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8141                         return (0);
8142                 }
8143
8144                 /*
8145                  * Need to be in the same zone unless we possess the
8146                  * privilege to examine all zones.
8147                  */
8148                 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8149                     zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8150                         return (0);
8151                 }
8152         }
8153
8154         return (1);
8155 }
8156
8157 /*
8158  * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8159  * consists of input pattern strings and an ops-vector to evaluate them.
8160  * This function returns >0 for match, 0 for no match, and <0 for error.
8161  */
8162 static int
8163 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8164     uint32_t priv, uid_t uid, zoneid_t zoneid)
8165 {
8166         dtrace_provider_t *pvp = prp->dtpr_provider;
8167         int rv;
8168
8169         if (pvp->dtpv_defunct)
8170                 return (0);
8171
8172         if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8173                 return (rv);
8174
8175         if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8176                 return (rv);
8177
8178         if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8179                 return (rv);
8180
8181         if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8182                 return (rv);
8183
8184         if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8185                 return (0);
8186
8187         return (rv);
8188 }
8189
8190 /*
8191  * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
8192  * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
8193  * libc's version, the kernel version only applies to 8-bit ASCII strings.
8194  * In addition, all of the recursion cases except for '*' matching have been
8195  * unwound.  For '*', we still implement recursive evaluation, but a depth
8196  * counter is maintained and matching is aborted if we recurse too deep.
8197  * The function returns 0 if no match, >0 if match, and <0 if recursion error.
8198  */
8199 static int
8200 dtrace_match_glob(const char *s, const char *p, int depth)
8201 {
8202         const char *olds;
8203         char s1, c;
8204         int gs;
8205
8206         if (depth > DTRACE_PROBEKEY_MAXDEPTH)
8207                 return (-1);
8208
8209         if (s == NULL)
8210                 s = ""; /* treat NULL as empty string */
8211
8212 top:
8213         olds = s;
8214         s1 = *s++;
8215
8216         if (p == NULL)
8217                 return (0);
8218
8219         if ((c = *p++) == '\0')
8220                 return (s1 == '\0');
8221
8222         switch (c) {
8223         case '[': {
8224                 int ok = 0, notflag = 0;
8225                 char lc = '\0';
8226
8227                 if (s1 == '\0')
8228                         return (0);
8229
8230                 if (*p == '!') {
8231                         notflag = 1;
8232                         p++;
8233                 }
8234
8235                 if ((c = *p++) == '\0')
8236                         return (0);
8237
8238                 do {
8239                         if (c == '-' && lc != '\0' && *p != ']') {
8240                                 if ((c = *p++) == '\0')
8241                                         return (0);
8242                                 if (c == '\\' && (c = *p++) == '\0')
8243                                         return (0);
8244
8245                                 if (notflag) {
8246                                         if (s1 < lc || s1 > c)
8247                                                 ok++;
8248                                         else
8249                                                 return (0);
8250                                 } else if (lc <= s1 && s1 <= c)
8251                                         ok++;
8252
8253                         } else if (c == '\\' && (c = *p++) == '\0')
8254                                 return (0);
8255
8256                         lc = c; /* save left-hand 'c' for next iteration */
8257
8258                         if (notflag) {
8259                                 if (s1 != c)
8260                                         ok++;
8261                                 else
8262                                         return (0);
8263                         } else if (s1 == c)
8264                                 ok++;
8265
8266                         if ((c = *p++) == '\0')
8267                                 return (0);
8268
8269                 } while (c != ']');
8270
8271                 if (ok)
8272                         goto top;
8273
8274                 return (0);
8275         }
8276
8277         case '\\':
8278                 if ((c = *p++) == '\0')
8279                         return (0);
8280                 /*FALLTHRU*/
8281
8282         default:
8283                 if (c != s1)
8284                         return (0);
8285                 /*FALLTHRU*/
8286
8287         case '?':
8288                 if (s1 != '\0')
8289                         goto top;
8290                 return (0);
8291
8292         case '*':
8293                 while (*p == '*')
8294                         p++; /* consecutive *'s are identical to a single one */
8295
8296                 if (*p == '\0')
8297                         return (1);
8298
8299                 for (s = olds; *s != '\0'; s++) {
8300                         if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
8301                                 return (gs);
8302                 }
8303
8304                 return (0);
8305         }
8306 }
8307
/*
 * Exact-match comparator: returns 1 when 's' is non-NULL and identical to
 * the pattern 'p', and 0 otherwise.  'depth' is unused; it is present only
 * so the signature agrees with the other match functions.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
	if (s == NULL)
		return (0);

	return (strcmp(s, p) == 0);
}
8314
/*
 * Comparator for the empty pattern: every input matches, so all three
 * arguments are ignored and the result is always 1.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
	(void) s;
	(void) p;
	(void) depth;

	return (1);
}
8321
/*
 * Comparator that accepts any non-empty input: returns 1 when 's' is
 * non-NULL and has at least one character, and 0 otherwise.  The pattern
 * 'p' and 'depth' are not consulted.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
	if (s == NULL)
		return (0);

	return (*s != '\0');
}
8328
8329 static int
8330 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8331     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *), void *arg)
8332 {
8333         dtrace_probe_t template, *probe;
8334         dtrace_hash_t *hash = NULL;
8335         int len, best = INT_MAX, nmatched = 0;
8336         dtrace_id_t i;
8337
8338         ASSERT(MUTEX_HELD(&dtrace_lock));
8339
8340         /*
8341          * If the probe ID is specified in the key, just lookup by ID and
8342          * invoke the match callback once if a matching probe is found.
8343          */
8344         if (pkp->dtpk_id != DTRACE_IDNONE) {
8345                 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8346                     dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8347                         (void) (*matched)(probe, arg);
8348                         nmatched++;
8349                 }
8350                 return (nmatched);
8351         }
8352
8353         template.dtpr_mod = (char *)pkp->dtpk_mod;
8354         template.dtpr_func = (char *)pkp->dtpk_func;
8355         template.dtpr_name = (char *)pkp->dtpk_name;
8356
8357         /*
8358          * We want to find the most distinct of the module name, function
8359          * name, and name.  So for each one that is not a glob pattern or
8360          * empty string, we perform a lookup in the corresponding hash and
8361          * use the hash table with the fewest collisions to do our search.
8362          */
8363         if (pkp->dtpk_mmatch == &dtrace_match_string &&
8364             (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8365                 best = len;
8366                 hash = dtrace_bymod;
8367         }
8368
8369         if (pkp->dtpk_fmatch == &dtrace_match_string &&
8370             (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8371                 best = len;
8372                 hash = dtrace_byfunc;
8373         }
8374
8375         if (pkp->dtpk_nmatch == &dtrace_match_string &&
8376             (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8377                 best = len;
8378                 hash = dtrace_byname;
8379         }
8380
8381         /*
8382          * If we did not select a hash table, iterate over every probe and
8383          * invoke our callback for each one that matches our input probe key.
8384          */
8385         if (hash == NULL) {
8386                 for (i = 0; i < dtrace_nprobes; i++) {
8387                         if ((probe = dtrace_probes[i]) == NULL ||
8388                             dtrace_match_probe(probe, pkp, priv, uid,
8389                             zoneid) <= 0)
8390                                 continue;
8391
8392                         nmatched++;
8393
8394                         if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8395                                 break;
8396                 }
8397
8398                 return (nmatched);
8399         }
8400
8401         /*
8402          * If we selected a hash table, iterate over each probe of the same key
8403          * name and invoke the callback for every probe that matches the other
8404          * attributes of our input probe key.
8405          */
8406         for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8407             probe = *(DTRACE_HASHNEXT(hash, probe))) {
8408
8409                 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8410                         continue;
8411
8412                 nmatched++;
8413
8414                 if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT)
8415                         break;
8416         }
8417
8418         return (nmatched);
8419 }
8420
8421 /*
8422  * Return the function pointer dtrace_probecmp() should use to compare the
8423  * specified pattern with a string.  For NULL or empty patterns, we select
8424  * dtrace_match_nul().  For glob pattern strings, we use dtrace_match_glob().
8425  * For non-empty non-glob strings, we use dtrace_match_string().
8426  */
8427 static dtrace_probekey_f *
8428 dtrace_probekey_func(const char *p)
8429 {
8430         char c;
8431
8432         if (p == NULL || *p == '\0')
8433                 return (&dtrace_match_nul);
8434
8435         while ((c = *p++) != '\0') {
8436                 if (c == '[' || c == '?' || c == '*' || c == '\\')
8437                         return (&dtrace_match_glob);
8438         }
8439
8440         return (&dtrace_match_string);
8441 }
8442
8443 /*
8444  * Build a probe comparison key for use with dtrace_match_probe() from the
8445  * given probe description.  By convention, a null key only matches anchored
8446  * probes: if each field is the empty string, reset dtpk_fmatch to
8447  * dtrace_match_nonzero().
8448  */
8449 static void
8450 dtrace_probekey(dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8451 {
8452         pkp->dtpk_prov = pdp->dtpd_provider;
8453         pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8454
8455         pkp->dtpk_mod = pdp->dtpd_mod;
8456         pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
8457
8458         pkp->dtpk_func = pdp->dtpd_func;
8459         pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8460
8461         pkp->dtpk_name = pdp->dtpd_name;
8462         pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8463
8464         pkp->dtpk_id = pdp->dtpd_id;
8465
8466         if (pkp->dtpk_id == DTRACE_IDNONE &&
8467             pkp->dtpk_pmatch == &dtrace_match_nul &&
8468             pkp->dtpk_mmatch == &dtrace_match_nul &&
8469             pkp->dtpk_fmatch == &dtrace_match_nul &&
8470             pkp->dtpk_nmatch == &dtrace_match_nul)
8471                 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8472 }
8473
8474 /*
8475  * DTrace Provider-to-Framework API Functions
8476  *
8477  * These functions implement much of the Provider-to-Framework API, as
8478  * described in <sys/dtrace.h>.  The parts of the API not in this section are
8479  * the functions in the API for probe management (found below), and
8480  * dtrace_probe() itself (found above).
8481  */
8482
8483 /*
8484  * Register the calling provider with the DTrace framework.  This should
8485  * generally be called by DTrace providers in their attach(9E) entry point.
8486  */
8487 int
8488 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8489     cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8490 {
8491         dtrace_provider_t *provider;
8492
8493         if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8494                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8495                     "arguments", name ? name : "<NULL>");
8496                 return (EINVAL);
8497         }
8498
8499         if (name[0] == '\0' || dtrace_badname(name)) {
8500                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8501                     "provider name", name);
8502                 return (EINVAL);
8503         }
8504
8505         if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8506             pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8507             pops->dtps_destroy == NULL ||
8508             ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8509                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8510                     "provider ops", name);
8511                 return (EINVAL);
8512         }
8513
8514         if (dtrace_badattr(&pap->dtpa_provider) ||
8515             dtrace_badattr(&pap->dtpa_mod) ||
8516             dtrace_badattr(&pap->dtpa_func) ||
8517             dtrace_badattr(&pap->dtpa_name) ||
8518             dtrace_badattr(&pap->dtpa_args)) {
8519                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8520                     "provider attributes", name);
8521                 return (EINVAL);
8522         }
8523
8524         if (priv & ~DTRACE_PRIV_ALL) {
8525                 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8526                     "privilege attributes", name);
8527                 return (EINVAL);
8528         }
8529
8530         if ((priv & DTRACE_PRIV_KERNEL) &&
8531             (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8532             pops->dtps_usermode == NULL) {
8533                 cmn_err(CE_WARN, "failed to register provider '%s': need "
8534                     "dtps_usermode() op for given privilege attributes", name);
8535                 return (EINVAL);
8536         }
8537
8538         provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8539         provider->dtpv_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
8540         (void) strcpy(provider->dtpv_name, name);
8541
8542         provider->dtpv_attr = *pap;
8543         provider->dtpv_priv.dtpp_flags = priv;
8544         if (cr != NULL) {
8545                 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8546                 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8547         }
8548         provider->dtpv_pops = *pops;
8549
8550         if (pops->dtps_provide == NULL) {
8551                 ASSERT(pops->dtps_provide_module != NULL);
8552                 provider->dtpv_pops.dtps_provide =
8553                     (void (*)(void *, dtrace_probedesc_t *))dtrace_nullop;
8554         }
8555
8556         if (pops->dtps_provide_module == NULL) {
8557                 ASSERT(pops->dtps_provide != NULL);
8558                 provider->dtpv_pops.dtps_provide_module =
8559                     (void (*)(void *, modctl_t *))dtrace_nullop;
8560         }
8561
8562         if (pops->dtps_suspend == NULL) {
8563                 ASSERT(pops->dtps_resume == NULL);
8564                 provider->dtpv_pops.dtps_suspend =
8565                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8566                 provider->dtpv_pops.dtps_resume =
8567                     (void (*)(void *, dtrace_id_t, void *))dtrace_nullop;
8568         }
8569
8570         provider->dtpv_arg = arg;
8571         *idp = (dtrace_provider_id_t)provider;
8572
8573         if (pops == &dtrace_provider_ops) {
8574                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8575                 ASSERT(MUTEX_HELD(&dtrace_lock));
8576                 ASSERT(dtrace_anon.dta_enabling == NULL);
8577
8578                 /*
8579                  * We make sure that the DTrace provider is at the head of
8580                  * the provider chain.
8581                  */
8582                 provider->dtpv_next = dtrace_provider;
8583                 dtrace_provider = provider;
8584                 return (0);
8585         }
8586
8587         mutex_enter(&dtrace_provider_lock);
8588         mutex_enter(&dtrace_lock);
8589
8590         /*
8591          * If there is at least one provider registered, we'll add this
8592          * provider after the first provider.
8593          */
8594         if (dtrace_provider != NULL) {
8595                 provider->dtpv_next = dtrace_provider->dtpv_next;
8596                 dtrace_provider->dtpv_next = provider;
8597         } else {
8598                 dtrace_provider = provider;
8599         }
8600
8601         if (dtrace_retained != NULL) {
8602                 dtrace_enabling_provide(provider);
8603
8604                 /*
8605                  * Now we need to call dtrace_enabling_matchall() -- which
8606                  * will acquire cpu_lock and dtrace_lock.  We therefore need
8607                  * to drop all of our locks before calling into it...
8608                  */
8609                 mutex_exit(&dtrace_lock);
8610                 mutex_exit(&dtrace_provider_lock);
8611                 dtrace_enabling_matchall();
8612
8613                 return (0);
8614         }
8615
8616         mutex_exit(&dtrace_lock);
8617         mutex_exit(&dtrace_provider_lock);
8618
8619         return (0);
8620 }
8621
8622 /*
8623  * Unregister the specified provider from the DTrace framework.  This should
8624  * generally be called by DTrace providers in their detach(9E) entry point.
8625  */
8626 int
8627 dtrace_unregister(dtrace_provider_id_t id)
8628 {
8629         dtrace_provider_t *old = (dtrace_provider_t *)id;
8630         dtrace_provider_t *prev = NULL;
8631         int i, self = 0, noreap = 0;
8632         dtrace_probe_t *probe, *first = NULL;
8633
8634         if (old->dtpv_pops.dtps_enable ==
8635             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) {
8636                 /*
8637                  * If DTrace itself is the provider, we're called with locks
8638                  * already held.
8639                  */
8640                 ASSERT(old == dtrace_provider);
8641 #if defined(sun)
8642                 ASSERT(dtrace_devi != NULL);
8643 #endif
8644                 ASSERT(MUTEX_HELD(&dtrace_provider_lock));
8645                 ASSERT(MUTEX_HELD(&dtrace_lock));
8646                 self = 1;
8647
8648                 if (dtrace_provider->dtpv_next != NULL) {
8649                         /*
8650                          * There's another provider here; return failure.
8651                          */
8652                         return (EBUSY);
8653                 }
8654         } else {
8655                 mutex_enter(&dtrace_provider_lock);
8656 #if defined(sun)
8657                 mutex_enter(&mod_lock);
8658 #endif
8659                 mutex_enter(&dtrace_lock);
8660         }
8661
8662         /*
8663          * If anyone has /dev/dtrace open, or if there are anonymous enabled
8664          * probes, we refuse to let providers slither away, unless this
8665          * provider has already been explicitly invalidated.
8666          */
8667         if (!old->dtpv_defunct &&
8668             (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8669             dtrace_anon.dta_state->dts_necbs > 0))) {
8670                 if (!self) {
8671                         mutex_exit(&dtrace_lock);
8672 #if defined(sun)
8673                         mutex_exit(&mod_lock);
8674 #endif
8675                         mutex_exit(&dtrace_provider_lock);
8676                 }
8677                 return (EBUSY);
8678         }
8679
8680         /*
8681          * Attempt to destroy the probes associated with this provider.
8682          */
8683         for (i = 0; i < dtrace_nprobes; i++) {
8684                 if ((probe = dtrace_probes[i]) == NULL)
8685                         continue;
8686
8687                 if (probe->dtpr_provider != old)
8688                         continue;
8689
8690                 if (probe->dtpr_ecb == NULL)
8691                         continue;
8692
8693                 /*
8694                  * If we are trying to unregister a defunct provider, and the
8695                  * provider was made defunct within the interval dictated by
8696                  * dtrace_unregister_defunct_reap, we'll (asynchronously)
8697                  * attempt to reap our enablings.  To denote that the provider
8698                  * should reattempt to unregister itself at some point in the
8699                  * future, we will return a differentiable error code (EAGAIN
8700                  * instead of EBUSY) in this case.
8701                  */
8702                 if (dtrace_gethrtime() - old->dtpv_defunct >
8703                     dtrace_unregister_defunct_reap)
8704                         noreap = 1;
8705
8706                 if (!self) {
8707                         mutex_exit(&dtrace_lock);
8708 #if defined(sun)
8709                         mutex_exit(&mod_lock);
8710 #endif
8711                         mutex_exit(&dtrace_provider_lock);
8712                 }
8713
8714                 if (noreap)
8715                         return (EBUSY);
8716
8717                 (void) taskq_dispatch(dtrace_taskq,
8718                     (task_func_t *)dtrace_enabling_reap, NULL, TQ_SLEEP);
8719
8720                 return (EAGAIN);
8721         }
8722
8723         /*
8724          * All of the probes for this provider are disabled; we can safely
8725          * remove all of them from their hash chains and from the probe array.
8726          */
8727         for (i = 0; i < dtrace_nprobes; i++) {
8728                 if ((probe = dtrace_probes[i]) == NULL)
8729                         continue;
8730
8731                 if (probe->dtpr_provider != old)
8732                         continue;
8733
8734                 dtrace_probes[i] = NULL;
8735
8736                 dtrace_hash_remove(dtrace_bymod, probe);
8737                 dtrace_hash_remove(dtrace_byfunc, probe);
8738                 dtrace_hash_remove(dtrace_byname, probe);
8739
8740                 if (first == NULL) {
8741                         first = probe;
8742                         probe->dtpr_nextmod = NULL;
8743                 } else {
8744                         probe->dtpr_nextmod = first;
8745                         first = probe;
8746                 }
8747         }
8748
8749         /*
8750          * The provider's probes have been removed from the hash chains and
8751          * from the probe array.  Now issue a dtrace_sync() to be sure that
8752          * everyone has cleared out from any probe array processing.
8753          */
8754         dtrace_sync();
8755
8756         for (probe = first; probe != NULL; probe = first) {
8757                 first = probe->dtpr_nextmod;
8758
8759                 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8760                     probe->dtpr_arg);
8761                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8762                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8763                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8764 #if defined(sun)
8765                 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8766 #else
8767                 free_unr(dtrace_arena, probe->dtpr_id);
8768 #endif
8769                 kmem_free(probe, sizeof (dtrace_probe_t));
8770         }
8771
8772         if ((prev = dtrace_provider) == old) {
8773 #if defined(sun)
8774                 ASSERT(self || dtrace_devi == NULL);
8775                 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8776 #endif
8777                 dtrace_provider = old->dtpv_next;
8778         } else {
8779                 while (prev != NULL && prev->dtpv_next != old)
8780                         prev = prev->dtpv_next;
8781
8782                 if (prev == NULL) {
8783                         panic("attempt to unregister non-existent "
8784                             "dtrace provider %p\n", (void *)id);
8785                 }
8786
8787                 prev->dtpv_next = old->dtpv_next;
8788         }
8789
8790         if (!self) {
8791                 mutex_exit(&dtrace_lock);
8792 #if defined(sun)
8793                 mutex_exit(&mod_lock);
8794 #endif
8795                 mutex_exit(&dtrace_provider_lock);
8796         }
8797
8798         kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
8799         kmem_free(old, sizeof (dtrace_provider_t));
8800
8801         return (0);
8802 }
8803
8804 /*
8805  * Invalidate the specified provider.  All subsequent probe lookups for the
8806  * specified provider will fail, but its probes will not be removed.
8807  */
8808 void
8809 dtrace_invalidate(dtrace_provider_id_t id)
8810 {
8811         dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8812
8813         ASSERT(pvp->dtpv_pops.dtps_enable !=
8814             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8815
8816         mutex_enter(&dtrace_provider_lock);
8817         mutex_enter(&dtrace_lock);
8818
8819         pvp->dtpv_defunct = dtrace_gethrtime();
8820
8821         mutex_exit(&dtrace_lock);
8822         mutex_exit(&dtrace_provider_lock);
8823 }
8824
8825 /*
8826  * Indicate whether or not DTrace has attached.
8827  */
8828 int
8829 dtrace_attached(void)
8830 {
8831         /*
8832          * dtrace_provider will be non-NULL iff the DTrace driver has
8833          * attached.  (It's non-NULL because DTrace is always itself a
8834          * provider.)
8835          */
8836         return (dtrace_provider != NULL);
8837 }
8838
8839 /*
8840  * Remove all the unenabled probes for the given provider.  This function is
8841  * not unlike dtrace_unregister(), except that it doesn't remove the provider
8842  * -- just as many of its associated probes as it can.
8843  */
8844 int
8845 dtrace_condense(dtrace_provider_id_t id)
8846 {
8847         dtrace_provider_t *prov = (dtrace_provider_t *)id;
8848         int i;
8849         dtrace_probe_t *probe;
8850
8851         /*
8852          * Make sure this isn't the dtrace provider itself.
8853          */
8854         ASSERT(prov->dtpv_pops.dtps_enable !=
8855             (void (*)(void *, dtrace_id_t, void *))dtrace_nullop);
8856
8857         mutex_enter(&dtrace_provider_lock);
8858         mutex_enter(&dtrace_lock);
8859
8860         /*
8861          * Attempt to destroy the probes associated with this provider.
8862          */
8863         for (i = 0; i < dtrace_nprobes; i++) {
8864                 if ((probe = dtrace_probes[i]) == NULL)
8865                         continue;
8866
8867                 if (probe->dtpr_provider != prov)
8868                         continue;
8869
8870                 if (probe->dtpr_ecb != NULL)
8871                         continue;
8872
8873                 dtrace_probes[i] = NULL;
8874
8875                 dtrace_hash_remove(dtrace_bymod, probe);
8876                 dtrace_hash_remove(dtrace_byfunc, probe);
8877                 dtrace_hash_remove(dtrace_byname, probe);
8878
8879                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
8880                     probe->dtpr_arg);
8881                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
8882                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
8883                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
8884                 kmem_free(probe, sizeof (dtrace_probe_t));
8885 #if defined(sun)
8886                 vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
8887 #else
8888                 free_unr(dtrace_arena, i + 1);
8889 #endif
8890         }
8891
8892         mutex_exit(&dtrace_lock);
8893         mutex_exit(&dtrace_provider_lock);
8894
8895         return (0);
8896 }
8897
8898 /*
8899  * DTrace Probe Management Functions
8900  *
8901  * The functions in this section perform the DTrace probe management,
8902  * including functions to create probes, look-up probes, and call into the
8903  * providers to request that probes be provided.  Some of these functions are
8904  * in the Provider-to-Framework API; these functions can be identified by the
8905  * fact that they are not declared "static".
8906  */
8907
8908 /*
8909  * Create a probe with the specified module name, function name, and name.
8910  */
8911 dtrace_id_t
8912 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
8913     const char *func, const char *name, int aframes, void *arg)
8914 {
8915         dtrace_probe_t *probe, **probes;
8916         dtrace_provider_t *provider = (dtrace_provider_t *)prov;
8917         dtrace_id_t id;
8918
8919         if (provider == dtrace_provider) {
8920                 ASSERT(MUTEX_HELD(&dtrace_lock));
8921         } else {
8922                 mutex_enter(&dtrace_lock);
8923         }
8924
8925 #if defined(sun)
8926         id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
8927             VM_BESTFIT | VM_SLEEP);
8928 #else
8929         id = alloc_unr(dtrace_arena);
8930 #endif
8931         probe = kmem_zalloc(sizeof (dtrace_probe_t), KM_SLEEP);
8932
8933         probe->dtpr_id = id;
8934         probe->dtpr_gen = dtrace_probegen++;
8935         probe->dtpr_mod = dtrace_strdup(mod);
8936         probe->dtpr_func = dtrace_strdup(func);
8937         probe->dtpr_name = dtrace_strdup(name);
8938         probe->dtpr_arg = arg;
8939         probe->dtpr_aframes = aframes;
8940         probe->dtpr_provider = provider;
8941
8942         dtrace_hash_add(dtrace_bymod, probe);
8943         dtrace_hash_add(dtrace_byfunc, probe);
8944         dtrace_hash_add(dtrace_byname, probe);
8945
8946         if (id - 1 >= dtrace_nprobes) {
8947                 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
8948                 size_t nsize = osize << 1;
8949
8950                 if (nsize == 0) {
8951                         ASSERT(osize == 0);
8952                         ASSERT(dtrace_probes == NULL);
8953                         nsize = sizeof (dtrace_probe_t *);
8954                 }
8955
8956                 probes = kmem_zalloc(nsize, KM_SLEEP);
8957
8958                 if (dtrace_probes == NULL) {
8959                         ASSERT(osize == 0);
8960                         dtrace_probes = probes;
8961                         dtrace_nprobes = 1;
8962                 } else {
8963                         dtrace_probe_t **oprobes = dtrace_probes;
8964
8965                         bcopy(oprobes, probes, osize);
8966                         dtrace_membar_producer();
8967                         dtrace_probes = probes;
8968
8969                         dtrace_sync();
8970
8971                         /*
8972                          * All CPUs are now seeing the new probes array; we can
8973                          * safely free the old array.
8974                          */
8975                         kmem_free(oprobes, osize);
8976                         dtrace_nprobes <<= 1;
8977                 }
8978
8979                 ASSERT(id - 1 < dtrace_nprobes);
8980         }
8981
8982         ASSERT(dtrace_probes[id - 1] == NULL);
8983         dtrace_probes[id - 1] = probe;
8984
8985         if (provider != dtrace_provider)
8986                 mutex_exit(&dtrace_lock);
8987
8988         return (id);
8989 }
8990
8991 static dtrace_probe_t *
8992 dtrace_probe_lookup_id(dtrace_id_t id)
8993 {
8994         ASSERT(MUTEX_HELD(&dtrace_lock));
8995
8996         if (id == 0 || id > dtrace_nprobes)
8997                 return (NULL);
8998
8999         return (dtrace_probes[id - 1]);
9000 }
9001
9002 static int
9003 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg)
9004 {
9005         *((dtrace_id_t *)arg) = probe->dtpr_id;
9006
9007         return (DTRACE_MATCH_DONE);
9008 }
9009
9010 /*
9011  * Look up a probe based on provider and one or more of module name, function
9012  * name and probe name.
9013  */
9014 dtrace_id_t
9015 dtrace_probe_lookup(dtrace_provider_id_t prid, char *mod,
9016     char *func, char *name)
9017 {
9018         dtrace_probekey_t pkey;
9019         dtrace_id_t id;
9020         int match;
9021
9022         pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
9023         pkey.dtpk_pmatch = &dtrace_match_string;
9024         pkey.dtpk_mod = mod;
9025         pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9026         pkey.dtpk_func = func;
9027         pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9028         pkey.dtpk_name = name;
9029         pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9030         pkey.dtpk_id = DTRACE_IDNONE;
9031
9032         mutex_enter(&dtrace_lock);
9033         match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9034             dtrace_probe_lookup_match, &id);
9035         mutex_exit(&dtrace_lock);
9036
9037         ASSERT(match == 1 || match == 0);
9038         return (match ? id : 0);
9039 }
9040
9041 /*
9042  * Returns the probe argument associated with the specified probe.
9043  */
9044 void *
9045 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9046 {
9047         dtrace_probe_t *probe;
9048         void *rval = NULL;
9049
9050         mutex_enter(&dtrace_lock);
9051
9052         if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9053             probe->dtpr_provider == (dtrace_provider_t *)id)
9054                 rval = probe->dtpr_arg;
9055
9056         mutex_exit(&dtrace_lock);
9057
9058         return (rval);
9059 }
9060
9061 /*
9062  * Copy a probe into a probe description.
9063  */
9064 static void
9065 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9066 {
9067         bzero(pdp, sizeof (dtrace_probedesc_t));
9068         pdp->dtpd_id = prp->dtpr_id;
9069
9070         (void) strncpy(pdp->dtpd_provider,
9071             prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN - 1);
9072
9073         (void) strncpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN - 1);
9074         (void) strncpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN - 1);
9075         (void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
9076 }
9077
9078 /*
9079  * Called to indicate that a probe -- or probes -- should be provided by a
9080  * specfied provider.  If the specified description is NULL, the provider will
9081  * be told to provide all of its probes.  (This is done whenever a new
9082  * consumer comes along, or whenever a retained enabling is to be matched.) If
9083  * the specified description is non-NULL, the provider is given the
9084  * opportunity to dynamically provide the specified probe, allowing providers
9085  * to support the creation of probes on-the-fly.  (So-called _autocreated_
9086  * probes.)  If the provider is NULL, the operations will be applied to all
9087  * providers; if the provider is non-NULL the operations will only be applied
9088  * to the specified provider.  The dtrace_provider_lock must be held, and the
9089  * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9090  * will need to grab the dtrace_lock when it reenters the framework through
9091  * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9092  */
9093 static void
9094 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9095 {
9096 #if defined(sun)
9097         modctl_t *ctl;
9098 #endif
9099         int all = 0;
9100
9101         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
9102
9103         if (prv == NULL) {
9104                 all = 1;
9105                 prv = dtrace_provider;
9106         }
9107
9108         do {
9109                 /*
9110                  * First, call the blanket provide operation.
9111                  */
9112                 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9113
9114 #if defined(sun)
9115                 /*
9116                  * Now call the per-module provide operation.  We will grab
9117                  * mod_lock to prevent the list from being modified.  Note
9118                  * that this also prevents the mod_busy bits from changing.
9119                  * (mod_busy can only be changed with mod_lock held.)
9120                  */
9121                 mutex_enter(&mod_lock);
9122
9123                 ctl = &modules;
9124                 do {
9125                         if (ctl->mod_busy || ctl->mod_mp == NULL)
9126                                 continue;
9127
9128                         prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9129
9130                 } while ((ctl = ctl->mod_next) != &modules);
9131
9132                 mutex_exit(&mod_lock);
9133 #endif
9134         } while (all && (prv = prv->dtpv_next) != NULL);
9135 }
9136
#if defined(sun)
/*
 * Iterate over each probe, and call the Framework-to-Provider API function
 * denoted by offs.  offs is a byte offset into the provider's dtpv_pops
 * structure at which the function pointer to call is found.  Only enabled
 * probes (those with a non-NULL dtpr_ecb) are visited.
 */
static void
dtrace_probe_foreach(uintptr_t offs)
{
	void (*func)(void *, dtrace_id_t, void *);
	dtrace_provider_t *prov;
	dtrace_probe_t *probe;
	dtrace_icookie_t cookie;
	int i;

	/*
	 * We disable interrupts to walk through the probe array.  This is
	 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
	 * won't see stale data.
	 */
	cookie = dtrace_interrupt_disable();

	for (i = 0; i < dtrace_nprobes; i++) {
		probe = dtrace_probes[i];

		/*
		 * Skip empty slots, and skip probes that aren't enabled --
		 * the function is not called for those.
		 */
		if (probe == NULL || probe->dtpr_ecb == NULL)
			continue;

		prov = probe->dtpr_provider;

		/* Fetch the operation stored offs bytes into dtpv_pops. */
		func = *((void(**)(void *, dtrace_id_t, void *))
		    ((uintptr_t)&prov->dtpv_pops + offs));

		func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
	}

	dtrace_interrupt_enable(cookie);
}
#endif
9179
9180 static int
9181 dtrace_probe_enable(dtrace_probedesc_t *desc, dtrace_enabling_t *enab)
9182 {
9183         dtrace_probekey_t pkey;
9184         uint32_t priv;
9185         uid_t uid;
9186         zoneid_t zoneid;
9187
9188         ASSERT(MUTEX_HELD(&dtrace_lock));
9189         dtrace_ecb_create_cache = NULL;
9190
9191         if (desc == NULL) {
9192                 /*
9193                  * If we're passed a NULL description, we're being asked to
9194                  * create an ECB with a NULL probe.
9195                  */
9196                 (void) dtrace_ecb_create_enable(NULL, enab);
9197                 return (0);
9198         }
9199
9200         dtrace_probekey(desc, &pkey);
9201         dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9202             &priv, &uid, &zoneid);
9203
9204         return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
9205             enab));
9206 }
9207
9208 /*
9209  * DTrace Helper Provider Functions
9210  */
9211 static void
9212 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9213 {
9214         attr->dtat_name = DOF_ATTR_NAME(dofattr);
9215         attr->dtat_data = DOF_ATTR_DATA(dofattr);
9216         attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9217 }
9218
9219 static void
9220 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9221     const dof_provider_t *dofprov, char *strtab)
9222 {
9223         hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9224         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9225             dofprov->dofpv_provattr);
9226         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9227             dofprov->dofpv_modattr);
9228         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9229             dofprov->dofpv_funcattr);
9230         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9231             dofprov->dofpv_nameattr);
9232         dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9233             dofprov->dofpv_argsattr);
9234 }
9235
9236 static void
9237 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9238 {
9239         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9240         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9241         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9242         dof_provider_t *provider;
9243         dof_probe_t *probe;
9244         uint32_t *off, *enoff;
9245         uint8_t *arg;
9246         char *strtab;
9247         uint_t i, nprobes;
9248         dtrace_helper_provdesc_t dhpv;
9249         dtrace_helper_probedesc_t dhpb;
9250         dtrace_meta_t *meta = dtrace_meta_pid;
9251         dtrace_mops_t *mops = &meta->dtm_mops;
9252         void *parg;
9253
9254         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9255         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9256             provider->dofpv_strtab * dof->dofh_secsize);
9257         prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9258             provider->dofpv_probes * dof->dofh_secsize);
9259         arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9260             provider->dofpv_prargs * dof->dofh_secsize);
9261         off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9262             provider->dofpv_proffs * dof->dofh_secsize);
9263
9264         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9265         off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9266         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9267         enoff = NULL;
9268
9269         /*
9270          * See dtrace_helper_provider_validate().
9271          */
9272         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9273             provider->dofpv_prenoffs != DOF_SECT_NONE) {
9274                 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9275                     provider->dofpv_prenoffs * dof->dofh_secsize);
9276                 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9277         }
9278
9279         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9280
9281         /*
9282          * Create the provider.
9283          */
9284         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9285
9286         if ((parg = mops->dtms_provide_pid(meta->dtm_arg, &dhpv, pid)) == NULL)
9287                 return;
9288
9289         meta->dtm_count++;
9290
9291         /*
9292          * Create the probes.
9293          */
9294         for (i = 0; i < nprobes; i++) {
9295                 probe = (dof_probe_t *)(uintptr_t)(daddr +
9296                     prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9297
9298                 dhpb.dthpb_mod = dhp->dofhp_mod;
9299                 dhpb.dthpb_func = strtab + probe->dofpr_func;
9300                 dhpb.dthpb_name = strtab + probe->dofpr_name;
9301                 dhpb.dthpb_base = probe->dofpr_addr;
9302                 dhpb.dthpb_offs = off + probe->dofpr_offidx;
9303                 dhpb.dthpb_noffs = probe->dofpr_noffs;
9304                 if (enoff != NULL) {
9305                         dhpb.dthpb_enoffs = enoff + probe->dofpr_enoffidx;
9306                         dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9307                 } else {
9308                         dhpb.dthpb_enoffs = NULL;
9309                         dhpb.dthpb_nenoffs = 0;
9310                 }
9311                 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9312                 dhpb.dthpb_nargc = probe->dofpr_nargc;
9313                 dhpb.dthpb_xargc = probe->dofpr_xargc;
9314                 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9315                 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9316
9317                 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9318         }
9319 }
9320
9321 static void
9322 dtrace_helper_provide(dof_helper_t *dhp, pid_t pid)
9323 {
9324         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9325         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9326         int i;
9327
9328         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9329
9330         for (i = 0; i < dof->dofh_secnum; i++) {
9331                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9332                     dof->dofh_secoff + i * dof->dofh_secsize);
9333
9334                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9335                         continue;
9336
9337                 dtrace_helper_provide_one(dhp, sec, pid);
9338         }
9339
9340         /*
9341          * We may have just created probes, so we must now rematch against
9342          * any retained enablings.  Note that this call will acquire both
9343          * cpu_lock and dtrace_lock; the fact that we are holding
9344          * dtrace_meta_lock now is what defines the ordering with respect to
9345          * these three locks.
9346          */
9347         dtrace_enabling_matchall();
9348 }
9349
9350 static void
9351 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, pid_t pid)
9352 {
9353         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9354         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9355         dof_sec_t *str_sec;
9356         dof_provider_t *provider;
9357         char *strtab;
9358         dtrace_helper_provdesc_t dhpv;
9359         dtrace_meta_t *meta = dtrace_meta_pid;
9360         dtrace_mops_t *mops = &meta->dtm_mops;
9361
9362         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9363         str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9364             provider->dofpv_strtab * dof->dofh_secsize);
9365
9366         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9367
9368         /*
9369          * Create the provider.
9370          */
9371         dtrace_dofprov2hprov(&dhpv, provider, strtab);
9372
9373         mops->dtms_remove_pid(meta->dtm_arg, &dhpv, pid);
9374
9375         meta->dtm_count--;
9376 }
9377
9378 static void
9379 dtrace_helper_provider_remove(dof_helper_t *dhp, pid_t pid)
9380 {
9381         uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9382         dof_hdr_t *dof = (dof_hdr_t *)daddr;
9383         int i;
9384
9385         ASSERT(MUTEX_HELD(&dtrace_meta_lock));
9386
9387         for (i = 0; i < dof->dofh_secnum; i++) {
9388                 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9389                     dof->dofh_secoff + i * dof->dofh_secsize);
9390
9391                 if (sec->dofs_type != DOF_SECT_PROVIDER)
9392                         continue;
9393
9394                 dtrace_helper_provider_remove_one(dhp, sec, pid);
9395         }
9396 }
9397
9398 /*
9399  * DTrace Meta Provider-to-Framework API Functions
9400  *
9401  * These functions implement the Meta Provider-to-Framework API, as described
9402  * in <sys/dtrace.h>.
9403  */
9404 int
9405 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9406     dtrace_meta_provider_id_t *idp)
9407 {
9408         dtrace_meta_t *meta;
9409         dtrace_helpers_t *help, *next;
9410         int i;
9411
9412         *idp = DTRACE_METAPROVNONE;
9413
9414         /*
9415          * We strictly don't need the name, but we hold onto it for
9416          * debuggability. All hail error queues!
9417          */
9418         if (name == NULL) {
9419                 cmn_err(CE_WARN, "failed to register meta-provider: "
9420                     "invalid name");
9421                 return (EINVAL);
9422         }
9423
9424         if (mops == NULL ||
9425             mops->dtms_create_probe == NULL ||
9426             mops->dtms_provide_pid == NULL ||
9427             mops->dtms_remove_pid == NULL) {
9428                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9429                     "invalid ops", name);
9430                 return (EINVAL);
9431         }
9432
9433         meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9434         meta->dtm_mops = *mops;
9435         meta->dtm_name = kmem_alloc(strlen(name) + 1, KM_SLEEP);
9436         (void) strcpy(meta->dtm_name, name);
9437         meta->dtm_arg = arg;
9438
9439         mutex_enter(&dtrace_meta_lock);
9440         mutex_enter(&dtrace_lock);
9441
9442         if (dtrace_meta_pid != NULL) {
9443                 mutex_exit(&dtrace_lock);
9444                 mutex_exit(&dtrace_meta_lock);
9445                 cmn_err(CE_WARN, "failed to register meta-register %s: "
9446                     "user-land meta-provider exists", name);
9447                 kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
9448                 kmem_free(meta, sizeof (dtrace_meta_t));
9449                 return (EINVAL);
9450         }
9451
9452         dtrace_meta_pid = meta;
9453         *idp = (dtrace_meta_provider_id_t)meta;
9454
9455         /*
9456          * If there are providers and probes ready to go, pass them
9457          * off to the new meta provider now.
9458          */
9459
9460         help = dtrace_deferred_pid;
9461         dtrace_deferred_pid = NULL;
9462
9463         mutex_exit(&dtrace_lock);
9464
9465         while (help != NULL) {
9466                 for (i = 0; i < help->dthps_nprovs; i++) {
9467                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9468                             help->dthps_pid);
9469                 }
9470
9471                 next = help->dthps_next;
9472                 help->dthps_next = NULL;
9473                 help->dthps_prev = NULL;
9474                 help->dthps_deferred = 0;
9475                 help = next;
9476         }
9477
9478         mutex_exit(&dtrace_meta_lock);
9479
9480         return (0);
9481 }
9482
9483 int
9484 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9485 {
9486         dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9487
9488         mutex_enter(&dtrace_meta_lock);
9489         mutex_enter(&dtrace_lock);
9490
9491         if (old == dtrace_meta_pid) {
9492                 pp = &dtrace_meta_pid;
9493         } else {
9494                 panic("attempt to unregister non-existent "
9495                     "dtrace meta-provider %p\n", (void *)old);
9496         }
9497
9498         if (old->dtm_count != 0) {
9499                 mutex_exit(&dtrace_lock);
9500                 mutex_exit(&dtrace_meta_lock);
9501                 return (EBUSY);
9502         }
9503
9504         *pp = NULL;
9505
9506         mutex_exit(&dtrace_lock);
9507         mutex_exit(&dtrace_meta_lock);
9508
9509         kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
9510         kmem_free(old, sizeof (dtrace_meta_t));
9511
9512         return (0);
9513 }
9514
9515
9516 /*
9517  * DTrace DIF Object Functions
9518  */
9519 static int
9520 dtrace_difo_err(uint_t pc, const char *format, ...)
9521 {
9522         if (dtrace_err_verbose) {
9523                 va_list alist;
9524
9525                 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9526                 va_start(alist, format);
9527                 (void) vuprintf(format, alist);
9528                 va_end(alist);
9529         }
9530
9531 #ifdef DTRACE_ERRDEBUG
9532         dtrace_errdebug(format);
9533 #endif
9534         return (1);
9535 }
9536
9537 /*
9538  * Validate a DTrace DIF object by checking the IR instructions.  The following
9539  * rules are currently enforced by dtrace_difo_validate():
9540  *
9541  * 1. Each instruction must have a valid opcode
9542  * 2. Each register, string, variable, or subroutine reference must be valid
9543  * 3. No instruction can modify register %r0 (must be zero)
9544  * 4. All instruction reserved bits must be set to zero
9545  * 5. The last instruction must be a "ret" instruction
9546  * 6. All branch targets must reference a valid instruction _after_ the branch
9547  */
9548 static int
9549 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9550     cred_t *cr)
9551 {
9552         int err = 0, i;
9553         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9554         int kcheckload;
9555         uint_t pc;
9556
9557         kcheckload = cr == NULL ||
9558             (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9559
9560         dp->dtdo_destructive = 0;
9561
9562         for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9563                 dif_instr_t instr = dp->dtdo_buf[pc];
9564
9565                 uint_t r1 = DIF_INSTR_R1(instr);
9566                 uint_t r2 = DIF_INSTR_R2(instr);
9567                 uint_t rd = DIF_INSTR_RD(instr);
9568                 uint_t rs = DIF_INSTR_RS(instr);
9569                 uint_t label = DIF_INSTR_LABEL(instr);
9570                 uint_t v = DIF_INSTR_VAR(instr);
9571                 uint_t subr = DIF_INSTR_SUBR(instr);
9572                 uint_t type = DIF_INSTR_TYPE(instr);
9573                 uint_t op = DIF_INSTR_OP(instr);
9574
9575                 switch (op) {
9576                 case DIF_OP_OR:
9577                 case DIF_OP_XOR:
9578                 case DIF_OP_AND:
9579                 case DIF_OP_SLL:
9580                 case DIF_OP_SRL:
9581                 case DIF_OP_SRA:
9582                 case DIF_OP_SUB:
9583                 case DIF_OP_ADD:
9584                 case DIF_OP_MUL:
9585                 case DIF_OP_SDIV:
9586                 case DIF_OP_UDIV:
9587                 case DIF_OP_SREM:
9588                 case DIF_OP_UREM:
9589                 case DIF_OP_COPYS:
9590                         if (r1 >= nregs)
9591                                 err += efunc(pc, "invalid register %u\n", r1);
9592                         if (r2 >= nregs)
9593                                 err += efunc(pc, "invalid register %u\n", r2);
9594                         if (rd >= nregs)
9595                                 err += efunc(pc, "invalid register %u\n", rd);
9596                         if (rd == 0)
9597                                 err += efunc(pc, "cannot write to %r0\n");
9598                         break;
9599                 case DIF_OP_NOT:
9600                 case DIF_OP_MOV:
9601                 case DIF_OP_ALLOCS:
9602                         if (r1 >= nregs)
9603                                 err += efunc(pc, "invalid register %u\n", r1);
9604                         if (r2 != 0)
9605                                 err += efunc(pc, "non-zero reserved bits\n");
9606                         if (rd >= nregs)
9607                                 err += efunc(pc, "invalid register %u\n", rd);
9608                         if (rd == 0)
9609                                 err += efunc(pc, "cannot write to %r0\n");
9610                         break;
9611                 case DIF_OP_LDSB:
9612                 case DIF_OP_LDSH:
9613                 case DIF_OP_LDSW:
9614                 case DIF_OP_LDUB:
9615                 case DIF_OP_LDUH:
9616                 case DIF_OP_LDUW:
9617                 case DIF_OP_LDX:
9618                         if (r1 >= nregs)
9619                                 err += efunc(pc, "invalid register %u\n", r1);
9620                         if (r2 != 0)
9621                                 err += efunc(pc, "non-zero reserved bits\n");
9622                         if (rd >= nregs)
9623                                 err += efunc(pc, "invalid register %u\n", rd);
9624                         if (rd == 0)
9625                                 err += efunc(pc, "cannot write to %r0\n");
9626                         if (kcheckload)
9627                                 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9628                                     DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9629                         break;
9630                 case DIF_OP_RLDSB:
9631                 case DIF_OP_RLDSH:
9632                 case DIF_OP_RLDSW:
9633                 case DIF_OP_RLDUB:
9634                 case DIF_OP_RLDUH:
9635                 case DIF_OP_RLDUW:
9636                 case DIF_OP_RLDX:
9637                         if (r1 >= nregs)
9638                                 err += efunc(pc, "invalid register %u\n", r1);
9639                         if (r2 != 0)
9640                                 err += efunc(pc, "non-zero reserved bits\n");
9641                         if (rd >= nregs)
9642                                 err += efunc(pc, "invalid register %u\n", rd);
9643                         if (rd == 0)
9644                                 err += efunc(pc, "cannot write to %r0\n");
9645                         break;
9646                 case DIF_OP_ULDSB:
9647                 case DIF_OP_ULDSH:
9648                 case DIF_OP_ULDSW:
9649                 case DIF_OP_ULDUB:
9650                 case DIF_OP_ULDUH:
9651                 case DIF_OP_ULDUW:
9652                 case DIF_OP_ULDX:
9653                         if (r1 >= nregs)
9654                                 err += efunc(pc, "invalid register %u\n", r1);
9655                         if (r2 != 0)
9656                                 err += efunc(pc, "non-zero reserved bits\n");
9657                         if (rd >= nregs)
9658                                 err += efunc(pc, "invalid register %u\n", rd);
9659                         if (rd == 0)
9660                                 err += efunc(pc, "cannot write to %r0\n");
9661                         break;
9662                 case DIF_OP_STB:
9663                 case DIF_OP_STH:
9664                 case DIF_OP_STW:
9665                 case DIF_OP_STX:
9666                         if (r1 >= nregs)
9667                                 err += efunc(pc, "invalid register %u\n", r1);
9668                         if (r2 != 0)
9669                                 err += efunc(pc, "non-zero reserved bits\n");
9670                         if (rd >= nregs)
9671                                 err += efunc(pc, "invalid register %u\n", rd);
9672                         if (rd == 0)
9673                                 err += efunc(pc, "cannot write to 0 address\n");
9674                         break;
9675                 case DIF_OP_CMP:
9676                 case DIF_OP_SCMP:
9677                         if (r1 >= nregs)
9678                                 err += efunc(pc, "invalid register %u\n", r1);
9679                         if (r2 >= nregs)
9680                                 err += efunc(pc, "invalid register %u\n", r2);
9681                         if (rd != 0)
9682                                 err += efunc(pc, "non-zero reserved bits\n");
9683                         break;
9684                 case DIF_OP_TST:
9685                         if (r1 >= nregs)
9686                                 err += efunc(pc, "invalid register %u\n", r1);
9687                         if (r2 != 0 || rd != 0)
9688                                 err += efunc(pc, "non-zero reserved bits\n");
9689                         break;
9690                 case DIF_OP_BA:
9691                 case DIF_OP_BE:
9692                 case DIF_OP_BNE:
9693                 case DIF_OP_BG:
9694                 case DIF_OP_BGU:
9695                 case DIF_OP_BGE:
9696                 case DIF_OP_BGEU:
9697                 case DIF_OP_BL:
9698                 case DIF_OP_BLU:
9699                 case DIF_OP_BLE:
9700                 case DIF_OP_BLEU:
9701                         if (label >= dp->dtdo_len) {
9702                                 err += efunc(pc, "invalid branch target %u\n",
9703                                     label);
9704                         }
9705                         if (label <= pc) {
9706                                 err += efunc(pc, "backward branch to %u\n",
9707                                     label);
9708                         }
9709                         break;
9710                 case DIF_OP_RET:
9711                         if (r1 != 0 || r2 != 0)
9712                                 err += efunc(pc, "non-zero reserved bits\n");
9713                         if (rd >= nregs)
9714                                 err += efunc(pc, "invalid register %u\n", rd);
9715                         break;
9716                 case DIF_OP_NOP:
9717                 case DIF_OP_POPTS:
9718                 case DIF_OP_FLUSHTS:
9719                         if (r1 != 0 || r2 != 0 || rd != 0)
9720                                 err += efunc(pc, "non-zero reserved bits\n");
9721                         break;
9722                 case DIF_OP_SETX:
9723                         if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9724                                 err += efunc(pc, "invalid integer ref %u\n",
9725                                     DIF_INSTR_INTEGER(instr));
9726                         }
9727                         if (rd >= nregs)
9728                                 err += efunc(pc, "invalid register %u\n", rd);
9729                         if (rd == 0)
9730                                 err += efunc(pc, "cannot write to %r0\n");
9731                         break;
9732                 case DIF_OP_SETS:
9733                         if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9734                                 err += efunc(pc, "invalid string ref %u\n",
9735                                     DIF_INSTR_STRING(instr));
9736                         }
9737                         if (rd >= nregs)
9738                                 err += efunc(pc, "invalid register %u\n", rd);
9739                         if (rd == 0)
9740                                 err += efunc(pc, "cannot write to %r0\n");
9741                         break;
9742                 case DIF_OP_LDGA:
9743                 case DIF_OP_LDTA:
9744                         if (r1 > DIF_VAR_ARRAY_MAX)
9745                                 err += efunc(pc, "invalid array %u\n", r1);
9746                         if (r2 >= nregs)
9747                                 err += efunc(pc, "invalid register %u\n", r2);
9748                         if (rd >= nregs)
9749                                 err += efunc(pc, "invalid register %u\n", rd);
9750                         if (rd == 0)
9751                                 err += efunc(pc, "cannot write to %r0\n");
9752                         break;
9753                 case DIF_OP_LDGS:
9754                 case DIF_OP_LDTS:
9755                 case DIF_OP_LDLS:
9756                 case DIF_OP_LDGAA:
9757                 case DIF_OP_LDTAA:
9758                         if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9759                                 err += efunc(pc, "invalid variable %u\n", v);
9760                         if (rd >= nregs)
9761                                 err += efunc(pc, "invalid register %u\n", rd);
9762                         if (rd == 0)
9763                                 err += efunc(pc, "cannot write to %r0\n");
9764                         break;
9765                 case DIF_OP_STGS:
9766                 case DIF_OP_STTS:
9767                 case DIF_OP_STLS:
9768                 case DIF_OP_STGAA:
9769                 case DIF_OP_STTAA:
9770                         if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9771                                 err += efunc(pc, "invalid variable %u\n", v);
9772                         if (rs >= nregs)
9773                                 err += efunc(pc, "invalid register %u\n", rd);
9774                         break;
9775                 case DIF_OP_CALL:
9776                         if (subr > DIF_SUBR_MAX)
9777                                 err += efunc(pc, "invalid subr %u\n", subr);
9778                         if (rd >= nregs)
9779                                 err += efunc(pc, "invalid register %u\n", rd);
9780                         if (rd == 0)
9781                                 err += efunc(pc, "cannot write to %r0\n");
9782
9783                         if (subr == DIF_SUBR_COPYOUT ||
9784                             subr == DIF_SUBR_COPYOUTSTR) {
9785                                 dp->dtdo_destructive = 1;
9786                         }
9787
9788                         if (subr == DIF_SUBR_GETF) {
9789                                 /*
9790                                  * If we have a getf() we need to record that
9791                                  * in our state.  Note that our state can be
9792                                  * NULL if this is a helper -- but in that
9793                                  * case, the call to getf() is itself illegal,
9794                                  * and will be caught (slightly later) when
9795                                  * the helper is validated.
9796                                  */
9797                                 if (vstate->dtvs_state != NULL)
9798                                         vstate->dtvs_state->dts_getf++;
9799                         }
9800
9801                         break;
9802                 case DIF_OP_PUSHTR:
9803                         if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9804                                 err += efunc(pc, "invalid ref type %u\n", type);
9805                         if (r2 >= nregs)
9806                                 err += efunc(pc, "invalid register %u\n", r2);
9807                         if (rs >= nregs)
9808                                 err += efunc(pc, "invalid register %u\n", rs);
9809                         break;
9810                 case DIF_OP_PUSHTV:
9811                         if (type != DIF_TYPE_CTF)
9812                                 err += efunc(pc, "invalid val type %u\n", type);
9813                         if (r2 >= nregs)
9814                                 err += efunc(pc, "invalid register %u\n", r2);
9815                         if (rs >= nregs)
9816                                 err += efunc(pc, "invalid register %u\n", rs);
9817                         break;
9818                 default:
9819                         err += efunc(pc, "invalid opcode %u\n",
9820                             DIF_INSTR_OP(instr));
9821                 }
9822         }
9823
9824         if (dp->dtdo_len != 0 &&
9825             DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9826                 err += efunc(dp->dtdo_len - 1,
9827                     "expected 'ret' as last DIF instruction\n");
9828         }
9829
9830         if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9831                 /*
9832                  * If we're not returning by reference, the size must be either
9833                  * 0 or the size of one of the base types.
9834                  */
9835                 switch (dp->dtdo_rtype.dtdt_size) {
9836                 case 0:
9837                 case sizeof (uint8_t):
9838                 case sizeof (uint16_t):
9839                 case sizeof (uint32_t):
9840                 case sizeof (uint64_t):
9841                         break;
9842
9843                 default:
9844                         err += efunc(dp->dtdo_len - 1, "bad return size\n");
9845                 }
9846         }
9847
9848         for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9849                 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9850                 dtrace_diftype_t *vt, *et;
9851                 uint_t id, ndx;
9852
9853                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9854                     v->dtdv_scope != DIFV_SCOPE_THREAD &&
9855                     v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9856                         err += efunc(i, "unrecognized variable scope %d\n",
9857                             v->dtdv_scope);
9858                         break;
9859                 }
9860
9861                 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9862                     v->dtdv_kind != DIFV_KIND_SCALAR) {
9863                         err += efunc(i, "unrecognized variable type %d\n",
9864                             v->dtdv_kind);
9865                         break;
9866                 }
9867
9868                 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9869                         err += efunc(i, "%d exceeds variable id limit\n", id);
9870                         break;
9871                 }
9872
9873                 if (id < DIF_VAR_OTHER_UBASE)
9874                         continue;
9875
9876                 /*
9877                  * For user-defined variables, we need to check that this
9878                  * definition is identical to any previous definition that we
9879                  * encountered.
9880                  */
9881                 ndx = id - DIF_VAR_OTHER_UBASE;
9882
9883                 switch (v->dtdv_scope) {
9884                 case DIFV_SCOPE_GLOBAL:
9885                         if (ndx < vstate->dtvs_nglobals) {
9886                                 dtrace_statvar_t *svar;
9887
9888                                 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
9889                                         existing = &svar->dtsv_var;
9890                         }
9891
9892                         break;
9893
9894                 case DIFV_SCOPE_THREAD:
9895                         if (ndx < vstate->dtvs_ntlocals)
9896                                 existing = &vstate->dtvs_tlocals[ndx];
9897                         break;
9898
9899                 case DIFV_SCOPE_LOCAL:
9900                         if (ndx < vstate->dtvs_nlocals) {
9901                                 dtrace_statvar_t *svar;
9902
9903                                 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
9904                                         existing = &svar->dtsv_var;
9905                         }
9906
9907                         break;
9908                 }
9909
9910                 vt = &v->dtdv_type;
9911
9912                 if (vt->dtdt_flags & DIF_TF_BYREF) {
9913                         if (vt->dtdt_size == 0) {
9914                                 err += efunc(i, "zero-sized variable\n");
9915                                 break;
9916                         }
9917
9918                         if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
9919                             vt->dtdt_size > dtrace_global_maxsize) {
9920                                 err += efunc(i, "oversized by-ref global\n");
9921                                 break;
9922                         }
9923                 }
9924
9925                 if (existing == NULL || existing->dtdv_id == 0)
9926                         continue;
9927
9928                 ASSERT(existing->dtdv_id == v->dtdv_id);
9929                 ASSERT(existing->dtdv_scope == v->dtdv_scope);
9930
9931                 if (existing->dtdv_kind != v->dtdv_kind)
9932                         err += efunc(i, "%d changed variable kind\n", id);
9933
9934                 et = &existing->dtdv_type;
9935
9936                 if (vt->dtdt_flags != et->dtdt_flags) {
9937                         err += efunc(i, "%d changed variable type flags\n", id);
9938                         break;
9939                 }
9940
9941                 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
9942                         err += efunc(i, "%d changed variable type size\n", id);
9943                         break;
9944                 }
9945         }
9946
9947         return (err);
9948 }
9949
9950 /*
9951  * Validate a DTrace DIF object that it is to be used as a helper.  Helpers
9952  * are much more constrained than normal DIFOs.  Specifically, they may
9953  * not:
9954  *
9955  * 1. Make calls to subroutines other than copyin(), copyinstr() or
9956  *    miscellaneous string routines
9957  * 2. Access DTrace variables other than the args[] array, and the
9958  *    curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
9959  * 3. Have thread-local variables.
9960  * 4. Have dynamic variables.
9961  */
9962 static int
9963 dtrace_difo_validate_helper(dtrace_difo_t *dp)
9964 {
9965         int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9966         int err = 0;
9967         uint_t pc;
9968
9969         for (pc = 0; pc < dp->dtdo_len; pc++) {
9970                 dif_instr_t instr = dp->dtdo_buf[pc];
9971
9972                 uint_t v = DIF_INSTR_VAR(instr);
9973                 uint_t subr = DIF_INSTR_SUBR(instr);
9974                 uint_t op = DIF_INSTR_OP(instr);
9975
9976                 switch (op) {
9977                 case DIF_OP_OR:
9978                 case DIF_OP_XOR:
9979                 case DIF_OP_AND:
9980                 case DIF_OP_SLL:
9981                 case DIF_OP_SRL:
9982                 case DIF_OP_SRA:
9983                 case DIF_OP_SUB:
9984                 case DIF_OP_ADD:
9985                 case DIF_OP_MUL:
9986                 case DIF_OP_SDIV:
9987                 case DIF_OP_UDIV:
9988                 case DIF_OP_SREM:
9989                 case DIF_OP_UREM:
9990                 case DIF_OP_COPYS:
9991                 case DIF_OP_NOT:
9992                 case DIF_OP_MOV:
9993                 case DIF_OP_RLDSB:
9994                 case DIF_OP_RLDSH:
9995                 case DIF_OP_RLDSW:
9996                 case DIF_OP_RLDUB:
9997                 case DIF_OP_RLDUH:
9998                 case DIF_OP_RLDUW:
9999                 case DIF_OP_RLDX:
10000                 case DIF_OP_ULDSB:
10001                 case DIF_OP_ULDSH:
10002                 case DIF_OP_ULDSW:
10003                 case DIF_OP_ULDUB:
10004                 case DIF_OP_ULDUH:
10005                 case DIF_OP_ULDUW:
10006                 case DIF_OP_ULDX:
10007                 case DIF_OP_STB:
10008                 case DIF_OP_STH:
10009                 case DIF_OP_STW:
10010                 case DIF_OP_STX:
10011                 case DIF_OP_ALLOCS:
10012                 case DIF_OP_CMP:
10013                 case DIF_OP_SCMP:
10014                 case DIF_OP_TST:
10015                 case DIF_OP_BA:
10016                 case DIF_OP_BE:
10017                 case DIF_OP_BNE:
10018                 case DIF_OP_BG:
10019                 case DIF_OP_BGU:
10020                 case DIF_OP_BGE:
10021                 case DIF_OP_BGEU:
10022                 case DIF_OP_BL:
10023                 case DIF_OP_BLU:
10024                 case DIF_OP_BLE:
10025                 case DIF_OP_BLEU:
10026                 case DIF_OP_RET:
10027                 case DIF_OP_NOP:
10028                 case DIF_OP_POPTS:
10029                 case DIF_OP_FLUSHTS:
10030                 case DIF_OP_SETX:
10031                 case DIF_OP_SETS:
10032                 case DIF_OP_LDGA:
10033                 case DIF_OP_LDLS:
10034                 case DIF_OP_STGS:
10035                 case DIF_OP_STLS:
10036                 case DIF_OP_PUSHTR:
10037                 case DIF_OP_PUSHTV:
10038                         break;
10039
10040                 case DIF_OP_LDGS:
10041                         if (v >= DIF_VAR_OTHER_UBASE)
10042                                 break;
10043
10044                         if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10045                                 break;
10046
10047                         if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10048                             v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10049                             v == DIF_VAR_EXECARGS ||
10050                             v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10051                             v == DIF_VAR_UID || v == DIF_VAR_GID)
10052                                 break;
10053
10054                         err += efunc(pc, "illegal variable %u\n", v);
10055                         break;
10056
10057                 case DIF_OP_LDTA:
10058                 case DIF_OP_LDTS:
10059                 case DIF_OP_LDGAA:
10060                 case DIF_OP_LDTAA:
10061                         err += efunc(pc, "illegal dynamic variable load\n");
10062                         break;
10063
10064                 case DIF_OP_STTS:
10065                 case DIF_OP_STGAA:
10066                 case DIF_OP_STTAA:
10067                         err += efunc(pc, "illegal dynamic variable store\n");
10068                         break;
10069
10070                 case DIF_OP_CALL:
10071                         if (subr == DIF_SUBR_ALLOCA ||
10072                             subr == DIF_SUBR_BCOPY ||
10073                             subr == DIF_SUBR_COPYIN ||
10074                             subr == DIF_SUBR_COPYINTO ||
10075                             subr == DIF_SUBR_COPYINSTR ||
10076                             subr == DIF_SUBR_INDEX ||
10077                             subr == DIF_SUBR_INET_NTOA ||
10078                             subr == DIF_SUBR_INET_NTOA6 ||
10079                             subr == DIF_SUBR_INET_NTOP ||
10080                             subr == DIF_SUBR_JSON ||
10081                             subr == DIF_SUBR_LLTOSTR ||
10082                             subr == DIF_SUBR_STRTOLL ||
10083                             subr == DIF_SUBR_RINDEX ||
10084                             subr == DIF_SUBR_STRCHR ||
10085                             subr == DIF_SUBR_STRJOIN ||
10086                             subr == DIF_SUBR_STRRCHR ||
10087                             subr == DIF_SUBR_STRSTR ||
10088                             subr == DIF_SUBR_HTONS ||
10089                             subr == DIF_SUBR_HTONL ||
10090                             subr == DIF_SUBR_HTONLL ||
10091                             subr == DIF_SUBR_NTOHS ||
10092                             subr == DIF_SUBR_NTOHL ||
10093                             subr == DIF_SUBR_NTOHLL ||
10094                             subr == DIF_SUBR_MEMREF ||
10095 #if !defined(sun)
10096                             subr == DIF_SUBR_MEMSTR ||
10097 #endif
10098                             subr == DIF_SUBR_TYPEREF)
10099                                 break;
10100
10101                         err += efunc(pc, "invalid subr %u\n", subr);
10102                         break;
10103
10104                 default:
10105                         err += efunc(pc, "invalid opcode %u\n",
10106                             DIF_INSTR_OP(instr));
10107                 }
10108         }
10109
10110         return (err);
10111 }
10112
10113 /*
10114  * Returns 1 if the expression in the DIF object can be cached on a per-thread
10115  * basis; 0 if not.
10116  */
10117 static int
10118 dtrace_difo_cacheable(dtrace_difo_t *dp)
10119 {
10120         int i;
10121
10122         if (dp == NULL)
10123                 return (0);
10124
10125         for (i = 0; i < dp->dtdo_varlen; i++) {
10126                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10127
10128                 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10129                         continue;
10130
10131                 switch (v->dtdv_id) {
10132                 case DIF_VAR_CURTHREAD:
10133                 case DIF_VAR_PID:
10134                 case DIF_VAR_TID:
10135                 case DIF_VAR_EXECARGS:
10136                 case DIF_VAR_EXECNAME:
10137                 case DIF_VAR_ZONENAME:
10138                         break;
10139
10140                 default:
10141                         return (0);
10142                 }
10143         }
10144
10145         /*
10146          * This DIF object may be cacheable.  Now we need to look for any
10147          * array loading instructions, any memory loading instructions, or
10148          * any stores to thread-local variables.
10149          */
10150         for (i = 0; i < dp->dtdo_len; i++) {
10151                 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10152
10153                 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10154                     (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10155                     (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10156                     op == DIF_OP_LDGA || op == DIF_OP_STTS)
10157                         return (0);
10158         }
10159
10160         return (1);
10161 }
10162
10163 static void
10164 dtrace_difo_hold(dtrace_difo_t *dp)
10165 {
10166         int i;
10167
10168         ASSERT(MUTEX_HELD(&dtrace_lock));
10169
10170         dp->dtdo_refcnt++;
10171         ASSERT(dp->dtdo_refcnt != 0);
10172
10173         /*
10174          * We need to check this DIF object for references to the variable
10175          * DIF_VAR_VTIMESTAMP.
10176          */
10177         for (i = 0; i < dp->dtdo_varlen; i++) {
10178                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10179
10180                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10181                         continue;
10182
10183                 if (dtrace_vtime_references++ == 0)
10184                         dtrace_vtime_enable();
10185         }
10186 }
10187
10188 /*
10189  * This routine calculates the dynamic variable chunksize for a given DIF
10190  * object.  The calculation is not fool-proof, and can probably be tricked by
10191  * malicious DIF -- but it works for all compiler-generated DIF.  Because this
10192  * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10193  * if a dynamic variable size exceeds the chunksize.
10194  */
10195 static void
10196 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10197 {
10198         uint64_t sval = 0;
10199         dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10200         const dif_instr_t *text = dp->dtdo_buf;
10201         uint_t pc, srd = 0;
10202         uint_t ttop = 0;
10203         size_t size, ksize;
10204         uint_t id, i;
10205
10206         for (pc = 0; pc < dp->dtdo_len; pc++) {
10207                 dif_instr_t instr = text[pc];
10208                 uint_t op = DIF_INSTR_OP(instr);
10209                 uint_t rd = DIF_INSTR_RD(instr);
10210                 uint_t r1 = DIF_INSTR_R1(instr);
10211                 uint_t nkeys = 0;
10212                 uchar_t scope = 0;
10213
10214                 dtrace_key_t *key = tupregs;
10215
10216                 switch (op) {
10217                 case DIF_OP_SETX:
10218                         sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10219                         srd = rd;
10220                         continue;
10221
10222                 case DIF_OP_STTS:
10223                         key = &tupregs[DIF_DTR_NREGS];
10224                         key[0].dttk_size = 0;
10225                         key[1].dttk_size = 0;
10226                         nkeys = 2;
10227                         scope = DIFV_SCOPE_THREAD;
10228                         break;
10229
10230                 case DIF_OP_STGAA:
10231                 case DIF_OP_STTAA:
10232                         nkeys = ttop;
10233
10234                         if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10235                                 key[nkeys++].dttk_size = 0;
10236
10237                         key[nkeys++].dttk_size = 0;
10238
10239                         if (op == DIF_OP_STTAA) {
10240                                 scope = DIFV_SCOPE_THREAD;
10241                         } else {
10242                                 scope = DIFV_SCOPE_GLOBAL;
10243                         }
10244
10245                         break;
10246
10247                 case DIF_OP_PUSHTR:
10248                         if (ttop == DIF_DTR_NREGS)
10249                                 return;
10250
10251                         if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10252                                 /*
10253                                  * If the register for the size of the "pushtr"
10254                                  * is %r0 (or the value is 0) and the type is
10255                                  * a string, we'll use the system-wide default
10256                                  * string size.
10257                                  */
10258                                 tupregs[ttop++].dttk_size =
10259                                     dtrace_strsize_default;
10260                         } else {
10261                                 if (srd == 0)
10262                                         return;
10263
10264                                 tupregs[ttop++].dttk_size = sval;
10265                         }
10266
10267                         break;
10268
10269                 case DIF_OP_PUSHTV:
10270                         if (ttop == DIF_DTR_NREGS)
10271                                 return;
10272
10273                         tupregs[ttop++].dttk_size = 0;
10274                         break;
10275
10276                 case DIF_OP_FLUSHTS:
10277                         ttop = 0;
10278                         break;
10279
10280                 case DIF_OP_POPTS:
10281                         if (ttop != 0)
10282                                 ttop--;
10283                         break;
10284                 }
10285
10286                 sval = 0;
10287                 srd = 0;
10288
10289                 if (nkeys == 0)
10290                         continue;
10291
10292                 /*
10293                  * We have a dynamic variable allocation; calculate its size.
10294                  */
10295                 for (ksize = 0, i = 0; i < nkeys; i++)
10296                         ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10297
10298                 size = sizeof (dtrace_dynvar_t);
10299                 size += sizeof (dtrace_key_t) * (nkeys - 1);
10300                 size += ksize;
10301
10302                 /*
10303                  * Now we need to determine the size of the stored data.
10304                  */
10305                 id = DIF_INSTR_VAR(instr);
10306
10307                 for (i = 0; i < dp->dtdo_varlen; i++) {
10308                         dtrace_difv_t *v = &dp->dtdo_vartab[i];
10309
10310                         if (v->dtdv_id == id && v->dtdv_scope == scope) {
10311                                 size += v->dtdv_type.dtdt_size;
10312                                 break;
10313                         }
10314                 }
10315
10316                 if (i == dp->dtdo_varlen)
10317                         return;
10318
10319                 /*
10320                  * We have the size.  If this is larger than the chunk size
10321                  * for our dynamic variable state, reset the chunk size.
10322                  */
10323                 size = P2ROUNDUP(size, sizeof (uint64_t));
10324
10325                 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10326                         vstate->dtvs_dynvars.dtds_chunksize = size;
10327         }
10328 }
10329
10330 static void
10331 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10332 {
10333         int i, oldsvars, osz, nsz, otlocals, ntlocals;
10334         uint_t id;
10335
10336         ASSERT(MUTEX_HELD(&dtrace_lock));
10337         ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10338
10339         for (i = 0; i < dp->dtdo_varlen; i++) {
10340                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10341                 dtrace_statvar_t *svar, ***svarp = NULL;
10342                 size_t dsize = 0;
10343                 uint8_t scope = v->dtdv_scope;
10344                 int *np = NULL;
10345
10346                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10347                         continue;
10348
10349                 id -= DIF_VAR_OTHER_UBASE;
10350
10351                 switch (scope) {
10352                 case DIFV_SCOPE_THREAD:
10353                         while (id >= (otlocals = vstate->dtvs_ntlocals)) {
10354                                 dtrace_difv_t *tlocals;
10355
10356                                 if ((ntlocals = (otlocals << 1)) == 0)
10357                                         ntlocals = 1;
10358
10359                                 osz = otlocals * sizeof (dtrace_difv_t);
10360                                 nsz = ntlocals * sizeof (dtrace_difv_t);
10361
10362                                 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10363
10364                                 if (osz != 0) {
10365                                         bcopy(vstate->dtvs_tlocals,
10366                                             tlocals, osz);
10367                                         kmem_free(vstate->dtvs_tlocals, osz);
10368                                 }
10369
10370                                 vstate->dtvs_tlocals = tlocals;
10371                                 vstate->dtvs_ntlocals = ntlocals;
10372                         }
10373
10374                         vstate->dtvs_tlocals[id] = *v;
10375                         continue;
10376
10377                 case DIFV_SCOPE_LOCAL:
10378                         np = &vstate->dtvs_nlocals;
10379                         svarp = &vstate->dtvs_locals;
10380
10381                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10382                                 dsize = NCPU * (v->dtdv_type.dtdt_size +
10383                                     sizeof (uint64_t));
10384                         else
10385                                 dsize = NCPU * sizeof (uint64_t);
10386
10387                         break;
10388
10389                 case DIFV_SCOPE_GLOBAL:
10390                         np = &vstate->dtvs_nglobals;
10391                         svarp = &vstate->dtvs_globals;
10392
10393                         if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10394                                 dsize = v->dtdv_type.dtdt_size +
10395                                     sizeof (uint64_t);
10396
10397                         break;
10398
10399                 default:
10400                         ASSERT(0);
10401                 }
10402
10403                 while (id >= (oldsvars = *np)) {
10404                         dtrace_statvar_t **statics;
10405                         int newsvars, oldsize, newsize;
10406
10407                         if ((newsvars = (oldsvars << 1)) == 0)
10408                                 newsvars = 1;
10409
10410                         oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10411                         newsize = newsvars * sizeof (dtrace_statvar_t *);
10412
10413                         statics = kmem_zalloc(newsize, KM_SLEEP);
10414
10415                         if (oldsize != 0) {
10416                                 bcopy(*svarp, statics, oldsize);
10417                                 kmem_free(*svarp, oldsize);
10418                         }
10419
10420                         *svarp = statics;
10421                         *np = newsvars;
10422                 }
10423
10424                 if ((svar = (*svarp)[id]) == NULL) {
10425                         svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10426                         svar->dtsv_var = *v;
10427
10428                         if ((svar->dtsv_size = dsize) != 0) {
10429                                 svar->dtsv_data = (uint64_t)(uintptr_t)
10430                                     kmem_zalloc(dsize, KM_SLEEP);
10431                         }
10432
10433                         (*svarp)[id] = svar;
10434                 }
10435
10436                 svar->dtsv_refcnt++;
10437         }
10438
10439         dtrace_difo_chunksize(dp, vstate);
10440         dtrace_difo_hold(dp);
10441 }
10442
10443 static dtrace_difo_t *
10444 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10445 {
10446         dtrace_difo_t *new;
10447         size_t sz;
10448
10449         ASSERT(dp->dtdo_buf != NULL);
10450         ASSERT(dp->dtdo_refcnt != 0);
10451
10452         new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10453
10454         ASSERT(dp->dtdo_buf != NULL);
10455         sz = dp->dtdo_len * sizeof (dif_instr_t);
10456         new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10457         bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10458         new->dtdo_len = dp->dtdo_len;
10459
10460         if (dp->dtdo_strtab != NULL) {
10461                 ASSERT(dp->dtdo_strlen != 0);
10462                 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10463                 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10464                 new->dtdo_strlen = dp->dtdo_strlen;
10465         }
10466
10467         if (dp->dtdo_inttab != NULL) {
10468                 ASSERT(dp->dtdo_intlen != 0);
10469                 sz = dp->dtdo_intlen * sizeof (uint64_t);
10470                 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10471                 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10472                 new->dtdo_intlen = dp->dtdo_intlen;
10473         }
10474
10475         if (dp->dtdo_vartab != NULL) {
10476                 ASSERT(dp->dtdo_varlen != 0);
10477                 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10478                 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10479                 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10480                 new->dtdo_varlen = dp->dtdo_varlen;
10481         }
10482
10483         dtrace_difo_init(new, vstate);
10484         return (new);
10485 }
10486
10487 static void
10488 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10489 {
10490         int i;
10491
10492         ASSERT(dp->dtdo_refcnt == 0);
10493
10494         for (i = 0; i < dp->dtdo_varlen; i++) {
10495                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10496                 dtrace_statvar_t *svar, **svarp = NULL;
10497                 uint_t id;
10498                 uint8_t scope = v->dtdv_scope;
10499                 int *np = NULL;
10500
10501                 switch (scope) {
10502                 case DIFV_SCOPE_THREAD:
10503                         continue;
10504
10505                 case DIFV_SCOPE_LOCAL:
10506                         np = &vstate->dtvs_nlocals;
10507                         svarp = vstate->dtvs_locals;
10508                         break;
10509
10510                 case DIFV_SCOPE_GLOBAL:
10511                         np = &vstate->dtvs_nglobals;
10512                         svarp = vstate->dtvs_globals;
10513                         break;
10514
10515                 default:
10516                         ASSERT(0);
10517                 }
10518
10519                 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10520                         continue;
10521
10522                 id -= DIF_VAR_OTHER_UBASE;
10523                 ASSERT(id < *np);
10524
10525                 svar = svarp[id];
10526                 ASSERT(svar != NULL);
10527                 ASSERT(svar->dtsv_refcnt > 0);
10528
10529                 if (--svar->dtsv_refcnt > 0)
10530                         continue;
10531
10532                 if (svar->dtsv_size != 0) {
10533                         ASSERT(svar->dtsv_data != 0);
10534                         kmem_free((void *)(uintptr_t)svar->dtsv_data,
10535                             svar->dtsv_size);
10536                 }
10537
10538                 kmem_free(svar, sizeof (dtrace_statvar_t));
10539                 svarp[id] = NULL;
10540         }
10541
10542         if (dp->dtdo_buf != NULL)
10543                 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10544         if (dp->dtdo_inttab != NULL)
10545                 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10546         if (dp->dtdo_strtab != NULL)
10547                 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10548         if (dp->dtdo_vartab != NULL)
10549                 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10550
10551         kmem_free(dp, sizeof (dtrace_difo_t));
10552 }
10553
10554 static void
10555 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10556 {
10557         int i;
10558
10559         ASSERT(MUTEX_HELD(&dtrace_lock));
10560         ASSERT(dp->dtdo_refcnt != 0);
10561
10562         for (i = 0; i < dp->dtdo_varlen; i++) {
10563                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10564
10565                 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10566                         continue;
10567
10568                 ASSERT(dtrace_vtime_references > 0);
10569                 if (--dtrace_vtime_references == 0)
10570                         dtrace_vtime_disable();
10571         }
10572
10573         if (--dp->dtdo_refcnt == 0)
10574                 dtrace_difo_destroy(dp, vstate);
10575 }
10576
10577 /*
10578  * DTrace Format Functions
10579  */
10580 static uint16_t
10581 dtrace_format_add(dtrace_state_t *state, char *str)
10582 {
10583         char *fmt, **new;
10584         uint16_t ndx, len = strlen(str) + 1;
10585
10586         fmt = kmem_zalloc(len, KM_SLEEP);
10587         bcopy(str, fmt, len);
10588
10589         for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10590                 if (state->dts_formats[ndx] == NULL) {
10591                         state->dts_formats[ndx] = fmt;
10592                         return (ndx + 1);
10593                 }
10594         }
10595
10596         if (state->dts_nformats == USHRT_MAX) {
10597                 /*
10598                  * This is only likely if a denial-of-service attack is being
10599                  * attempted.  As such, it's okay to fail silently here.
10600                  */
10601                 kmem_free(fmt, len);
10602                 return (0);
10603         }
10604
10605         /*
10606          * For simplicity, we always resize the formats array to be exactly the
10607          * number of formats.
10608          */
10609         ndx = state->dts_nformats++;
10610         new = kmem_alloc((ndx + 1) * sizeof (char *), KM_SLEEP);
10611
10612         if (state->dts_formats != NULL) {
10613                 ASSERT(ndx != 0);
10614                 bcopy(state->dts_formats, new, ndx * sizeof (char *));
10615                 kmem_free(state->dts_formats, ndx * sizeof (char *));
10616         }
10617
10618         state->dts_formats = new;
10619         state->dts_formats[ndx] = fmt;
10620
10621         return (ndx + 1);
10622 }
10623
10624 static void
10625 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10626 {
10627         char *fmt;
10628
10629         ASSERT(state->dts_formats != NULL);
10630         ASSERT(format <= state->dts_nformats);
10631         ASSERT(state->dts_formats[format - 1] != NULL);
10632
10633         fmt = state->dts_formats[format - 1];
10634         kmem_free(fmt, strlen(fmt) + 1);
10635         state->dts_formats[format - 1] = NULL;
10636 }
10637
10638 static void
10639 dtrace_format_destroy(dtrace_state_t *state)
10640 {
10641         int i;
10642
10643         if (state->dts_nformats == 0) {
10644                 ASSERT(state->dts_formats == NULL);
10645                 return;
10646         }
10647
10648         ASSERT(state->dts_formats != NULL);
10649
10650         for (i = 0; i < state->dts_nformats; i++) {
10651                 char *fmt = state->dts_formats[i];
10652
10653                 if (fmt == NULL)
10654                         continue;
10655
10656                 kmem_free(fmt, strlen(fmt) + 1);
10657         }
10658
10659         kmem_free(state->dts_formats, state->dts_nformats * sizeof (char *));
10660         state->dts_nformats = 0;
10661         state->dts_formats = NULL;
10662 }
10663
10664 /*
10665  * DTrace Predicate Functions
10666  */
10667 static dtrace_predicate_t *
10668 dtrace_predicate_create(dtrace_difo_t *dp)
10669 {
10670         dtrace_predicate_t *pred;
10671
10672         ASSERT(MUTEX_HELD(&dtrace_lock));
10673         ASSERT(dp->dtdo_refcnt != 0);
10674
10675         pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10676         pred->dtp_difo = dp;
10677         pred->dtp_refcnt = 1;
10678
10679         if (!dtrace_difo_cacheable(dp))
10680                 return (pred);
10681
10682         if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10683                 /*
10684                  * This is only theoretically possible -- we have had 2^32
10685                  * cacheable predicates on this machine.  We cannot allow any
10686                  * more predicates to become cacheable:  as unlikely as it is,
10687                  * there may be a thread caching a (now stale) predicate cache
10688                  * ID. (N.B.: the temptation is being successfully resisted to
10689                  * have this cmn_err() "Holy shit -- we executed this code!")
10690                  */
10691                 return (pred);
10692         }
10693
10694         pred->dtp_cacheid = dtrace_predcache_id++;
10695
10696         return (pred);
10697 }
10698
10699 static void
10700 dtrace_predicate_hold(dtrace_predicate_t *pred)
10701 {
10702         ASSERT(MUTEX_HELD(&dtrace_lock));
10703         ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10704         ASSERT(pred->dtp_refcnt > 0);
10705
10706         pred->dtp_refcnt++;
10707 }
10708
10709 static void
10710 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10711 {
10712         dtrace_difo_t *dp = pred->dtp_difo;
10713
10714         ASSERT(MUTEX_HELD(&dtrace_lock));
10715         ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10716         ASSERT(pred->dtp_refcnt > 0);
10717
10718         if (--pred->dtp_refcnt == 0) {
10719                 dtrace_difo_release(pred->dtp_difo, vstate);
10720                 kmem_free(pred, sizeof (dtrace_predicate_t));
10721         }
10722 }
10723
10724 /*
10725  * DTrace Action Description Functions
10726  */
10727 static dtrace_actdesc_t *
10728 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10729     uint64_t uarg, uint64_t arg)
10730 {
10731         dtrace_actdesc_t *act;
10732
10733 #if defined(sun)
10734         ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
10735             arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
10736 #endif
10737
10738         act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10739         act->dtad_kind = kind;
10740         act->dtad_ntuple = ntuple;
10741         act->dtad_uarg = uarg;
10742         act->dtad_arg = arg;
10743         act->dtad_refcnt = 1;
10744
10745         return (act);
10746 }
10747
10748 static void
10749 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10750 {
10751         ASSERT(act->dtad_refcnt >= 1);
10752         act->dtad_refcnt++;
10753 }
10754
10755 static void
10756 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10757 {
10758         dtrace_actkind_t kind = act->dtad_kind;
10759         dtrace_difo_t *dp;
10760
10761         ASSERT(act->dtad_refcnt >= 1);
10762
10763         if (--act->dtad_refcnt != 0)
10764                 return;
10765
10766         if ((dp = act->dtad_difo) != NULL)
10767                 dtrace_difo_release(dp, vstate);
10768
10769         if (DTRACEACT_ISPRINTFLIKE(kind)) {
10770                 char *str = (char *)(uintptr_t)act->dtad_arg;
10771
10772 #if defined(sun)
10773                 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10774                     (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10775 #endif
10776
10777                 if (str != NULL)
10778                         kmem_free(str, strlen(str) + 1);
10779         }
10780
10781         kmem_free(act, sizeof (dtrace_actdesc_t));
10782 }
10783
10784 /*
10785  * DTrace ECB Functions
10786  */
10787 static dtrace_ecb_t *
10788 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10789 {
10790         dtrace_ecb_t *ecb;
10791         dtrace_epid_t epid;
10792
10793         ASSERT(MUTEX_HELD(&dtrace_lock));
10794
10795         ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10796         ecb->dte_predicate = NULL;
10797         ecb->dte_probe = probe;
10798
10799         /*
10800          * The default size is the size of the default action: recording
10801          * the header.
10802          */
10803         ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
10804         ecb->dte_alignment = sizeof (dtrace_epid_t);
10805
10806         epid = state->dts_epid++;
10807
10808         if (epid - 1 >= state->dts_necbs) {
10809                 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
10810                 int necbs = state->dts_necbs << 1;
10811
10812                 ASSERT(epid == state->dts_necbs + 1);
10813
10814                 if (necbs == 0) {
10815                         ASSERT(oecbs == NULL);
10816                         necbs = 1;
10817                 }
10818
10819                 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
10820
10821                 if (oecbs != NULL)
10822                         bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
10823
10824                 dtrace_membar_producer();
10825                 state->dts_ecbs = ecbs;
10826
10827                 if (oecbs != NULL) {
10828                         /*
10829                          * If this state is active, we must dtrace_sync()
10830                          * before we can free the old dts_ecbs array:  we're
10831                          * coming in hot, and there may be active ring
10832                          * buffer processing (which indexes into the dts_ecbs
10833                          * array) on another CPU.
10834                          */
10835                         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
10836                                 dtrace_sync();
10837
10838                         kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
10839                 }
10840
10841                 dtrace_membar_producer();
10842                 state->dts_necbs = necbs;
10843         }
10844
10845         ecb->dte_state = state;
10846
10847         ASSERT(state->dts_ecbs[epid - 1] == NULL);
10848         dtrace_membar_producer();
10849         state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
10850
10851         return (ecb);
10852 }
10853
10854 static void
10855 dtrace_ecb_enable(dtrace_ecb_t *ecb)
10856 {
10857         dtrace_probe_t *probe = ecb->dte_probe;
10858
10859         ASSERT(MUTEX_HELD(&cpu_lock));
10860         ASSERT(MUTEX_HELD(&dtrace_lock));
10861         ASSERT(ecb->dte_next == NULL);
10862
10863         if (probe == NULL) {
10864                 /*
10865                  * This is the NULL probe -- there's nothing to do.
10866                  */
10867                 return;
10868         }
10869
10870         if (probe->dtpr_ecb == NULL) {
10871                 dtrace_provider_t *prov = probe->dtpr_provider;
10872
10873                 /*
10874                  * We're the first ECB on this probe.
10875                  */
10876                 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
10877
10878                 if (ecb->dte_predicate != NULL)
10879                         probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
10880
10881                 prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
10882                     probe->dtpr_id, probe->dtpr_arg);
10883         } else {
10884                 /*
10885                  * This probe is already active.  Swing the last pointer to
10886                  * point to the new ECB, and issue a dtrace_sync() to assure
10887                  * that all CPUs have seen the change.
10888                  */
10889                 ASSERT(probe->dtpr_ecb_last != NULL);
10890                 probe->dtpr_ecb_last->dte_next = ecb;
10891                 probe->dtpr_ecb_last = ecb;
10892                 probe->dtpr_predcache = 0;
10893
10894                 dtrace_sync();
10895         }
10896 }
10897
10898 static void
10899 dtrace_ecb_resize(dtrace_ecb_t *ecb)
10900 {
10901         dtrace_action_t *act;
10902         uint32_t curneeded = UINT32_MAX;
10903         uint32_t aggbase = UINT32_MAX;
10904
10905         /*
10906          * If we record anything, we always record the dtrace_rechdr_t.  (And
10907          * we always record it first.)
10908          */
10909         ecb->dte_size = sizeof (dtrace_rechdr_t);
10910         ecb->dte_alignment = sizeof (dtrace_epid_t);
10911
10912         for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
10913                 dtrace_recdesc_t *rec = &act->dta_rec;
10914                 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
10915
10916                 ecb->dte_alignment = MAX(ecb->dte_alignment,
10917                     rec->dtrd_alignment);
10918
10919                 if (DTRACEACT_ISAGG(act->dta_kind)) {
10920                         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
10921
10922                         ASSERT(rec->dtrd_size != 0);
10923                         ASSERT(agg->dtag_first != NULL);
10924                         ASSERT(act->dta_prev->dta_intuple);
10925                         ASSERT(aggbase != UINT32_MAX);
10926                         ASSERT(curneeded != UINT32_MAX);
10927
10928                         agg->dtag_base = aggbase;
10929
10930                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10931                         rec->dtrd_offset = curneeded;
10932                         curneeded += rec->dtrd_size;
10933                         ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
10934
10935                         aggbase = UINT32_MAX;
10936                         curneeded = UINT32_MAX;
10937                 } else if (act->dta_intuple) {
10938                         if (curneeded == UINT32_MAX) {
10939                                 /*
10940                                  * This is the first record in a tuple.  Align
10941                                  * curneeded to be at offset 4 in an 8-byte
10942                                  * aligned block.
10943                                  */
10944                                 ASSERT(act->dta_prev == NULL ||
10945                                     !act->dta_prev->dta_intuple);
10946                                 ASSERT3U(aggbase, ==, UINT32_MAX);
10947                                 curneeded = P2PHASEUP(ecb->dte_size,
10948                                     sizeof (uint64_t), sizeof (dtrace_aggid_t));
10949
10950                                 aggbase = curneeded - sizeof (dtrace_aggid_t);
10951                                 ASSERT(IS_P2ALIGNED(aggbase,
10952                                     sizeof (uint64_t)));
10953                         }
10954                         curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
10955                         rec->dtrd_offset = curneeded;
10956                         curneeded += rec->dtrd_size;
10957                 } else {
10958                         /* tuples must be followed by an aggregation */
10959                         ASSERT(act->dta_prev == NULL ||
10960                             !act->dta_prev->dta_intuple);
10961
10962                         ecb->dte_size = P2ROUNDUP(ecb->dte_size,
10963                             rec->dtrd_alignment);
10964                         rec->dtrd_offset = ecb->dte_size;
10965                         ecb->dte_size += rec->dtrd_size;
10966                         ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
10967                 }
10968         }
10969
10970         if ((act = ecb->dte_action) != NULL &&
10971             !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
10972             ecb->dte_size == sizeof (dtrace_rechdr_t)) {
10973                 /*
10974                  * If the size is still sizeof (dtrace_rechdr_t), then all
10975                  * actions store no data; set the size to 0.
10976                  */
10977                 ecb->dte_size = 0;
10978         }
10979
10980         ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
10981         ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
10982         ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
10983             ecb->dte_needed);
10984 }
10985
10986 static dtrace_action_t *
10987 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
10988 {
10989         dtrace_aggregation_t *agg;
10990         size_t size = sizeof (uint64_t);
10991         int ntuple = desc->dtad_ntuple;
10992         dtrace_action_t *act;
10993         dtrace_recdesc_t *frec;
10994         dtrace_aggid_t aggid;
10995         dtrace_state_t *state = ecb->dte_state;
10996
10997         agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
10998         agg->dtag_ecb = ecb;
10999
11000         ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11001
11002         switch (desc->dtad_kind) {
11003         case DTRACEAGG_MIN:
11004                 agg->dtag_initial = INT64_MAX;
11005                 agg->dtag_aggregate = dtrace_aggregate_min;
11006                 break;
11007
11008         case DTRACEAGG_MAX:
11009                 agg->dtag_initial = INT64_MIN;
11010                 agg->dtag_aggregate = dtrace_aggregate_max;
11011                 break;
11012
11013         case DTRACEAGG_COUNT:
11014                 agg->dtag_aggregate = dtrace_aggregate_count;
11015                 break;
11016
11017         case DTRACEAGG_QUANTIZE:
11018                 agg->dtag_aggregate = dtrace_aggregate_quantize;
11019                 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11020                     sizeof (uint64_t);
11021                 break;
11022
11023         case DTRACEAGG_LQUANTIZE: {
11024                 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11025                 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11026
11027                 agg->dtag_initial = desc->dtad_arg;
11028                 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11029
11030                 if (step == 0 || levels == 0)
11031                         goto err;
11032
11033                 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11034                 break;
11035         }
11036
11037         case DTRACEAGG_LLQUANTIZE: {
11038                 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11039                 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11040                 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11041                 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11042                 int64_t v;
11043
11044                 agg->dtag_initial = desc->dtad_arg;
11045                 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11046
11047                 if (factor < 2 || low >= high || nsteps < factor)
11048                         goto err;
11049
11050                 /*
11051                  * Now check that the number of steps evenly divides a power
11052                  * of the factor.  (This assures both integer bucket size and
11053                  * linearity within each magnitude.)
11054                  */
11055                 for (v = factor; v < nsteps; v *= factor)
11056                         continue;
11057
11058                 if ((v % nsteps) || (nsteps % factor))
11059                         goto err;
11060
11061                 size = (dtrace_aggregate_llquantize_bucket(factor,
11062                     low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11063                 break;
11064         }
11065
11066         case DTRACEAGG_AVG:
11067                 agg->dtag_aggregate = dtrace_aggregate_avg;
11068                 size = sizeof (uint64_t) * 2;
11069                 break;
11070
11071         case DTRACEAGG_STDDEV:
11072                 agg->dtag_aggregate = dtrace_aggregate_stddev;
11073                 size = sizeof (uint64_t) * 4;
11074                 break;
11075
11076         case DTRACEAGG_SUM:
11077                 agg->dtag_aggregate = dtrace_aggregate_sum;
11078                 break;
11079
11080         default:
11081                 goto err;
11082         }
11083
11084         agg->dtag_action.dta_rec.dtrd_size = size;
11085
11086         if (ntuple == 0)
11087                 goto err;
11088
11089         /*
11090          * We must make sure that we have enough actions for the n-tuple.
11091          */
11092         for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11093                 if (DTRACEACT_ISAGG(act->dta_kind))
11094                         break;
11095
11096                 if (--ntuple == 0) {
11097                         /*
11098                          * This is the action with which our n-tuple begins.
11099                          */
11100                         agg->dtag_first = act;
11101                         goto success;
11102                 }
11103         }
11104
11105         /*
11106          * This n-tuple is short by ntuple elements.  Return failure.
11107          */
11108         ASSERT(ntuple != 0);
11109 err:
11110         kmem_free(agg, sizeof (dtrace_aggregation_t));
11111         return (NULL);
11112
11113 success:
11114         /*
11115          * If the last action in the tuple has a size of zero, it's actually
11116          * an expression argument for the aggregating action.
11117          */
11118         ASSERT(ecb->dte_action_last != NULL);
11119         act = ecb->dte_action_last;
11120
11121         if (act->dta_kind == DTRACEACT_DIFEXPR) {
11122                 ASSERT(act->dta_difo != NULL);
11123
11124                 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11125                         agg->dtag_hasarg = 1;
11126         }
11127
11128         /*
11129          * We need to allocate an id for this aggregation.
11130          */
11131 #if defined(sun)
11132         aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11133             VM_BESTFIT | VM_SLEEP);
11134 #else
11135         aggid = alloc_unr(state->dts_aggid_arena);
11136 #endif
11137
11138         if (aggid - 1 >= state->dts_naggregations) {
11139                 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11140                 dtrace_aggregation_t **aggs;
11141                 int naggs = state->dts_naggregations << 1;
11142                 int onaggs = state->dts_naggregations;
11143
11144                 ASSERT(aggid == state->dts_naggregations + 1);
11145
11146                 if (naggs == 0) {
11147                         ASSERT(oaggs == NULL);
11148                         naggs = 1;
11149                 }
11150
11151                 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11152
11153                 if (oaggs != NULL) {
11154                         bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11155                         kmem_free(oaggs, onaggs * sizeof (*aggs));
11156                 }
11157
11158                 state->dts_aggregations = aggs;
11159                 state->dts_naggregations = naggs;
11160         }
11161
11162         ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11163         state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11164
11165         frec = &agg->dtag_first->dta_rec;
11166         if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11167                 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11168
11169         for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11170                 ASSERT(!act->dta_intuple);
11171                 act->dta_intuple = 1;
11172         }
11173
11174         return (&agg->dtag_action);
11175 }
11176
11177 static void
11178 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11179 {
11180         dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11181         dtrace_state_t *state = ecb->dte_state;
11182         dtrace_aggid_t aggid = agg->dtag_id;
11183
11184         ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11185 #if defined(sun)
11186         vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11187 #else
11188         free_unr(state->dts_aggid_arena, aggid);
11189 #endif
11190
11191         ASSERT(state->dts_aggregations[aggid - 1] == agg);
11192         state->dts_aggregations[aggid - 1] = NULL;
11193
11194         kmem_free(agg, sizeof (dtrace_aggregation_t));
11195 }
11196
11197 static int
11198 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11199 {
11200         dtrace_action_t *action, *last;
11201         dtrace_difo_t *dp = desc->dtad_difo;
11202         uint32_t size = 0, align = sizeof (uint8_t), mask;
11203         uint16_t format = 0;
11204         dtrace_recdesc_t *rec;
11205         dtrace_state_t *state = ecb->dte_state;
11206         dtrace_optval_t *opt = state->dts_options, nframes = 0, strsize;
11207         uint64_t arg = desc->dtad_arg;
11208
11209         ASSERT(MUTEX_HELD(&dtrace_lock));
11210         ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11211
11212         if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11213                 /*
11214                  * If this is an aggregating action, there must be neither
11215                  * a speculate nor a commit on the action chain.
11216                  */
11217                 dtrace_action_t *act;
11218
11219                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11220                         if (act->dta_kind == DTRACEACT_COMMIT)
11221                                 return (EINVAL);
11222
11223                         if (act->dta_kind == DTRACEACT_SPECULATE)
11224                                 return (EINVAL);
11225                 }
11226
11227                 action = dtrace_ecb_aggregation_create(ecb, desc);
11228
11229                 if (action == NULL)
11230                         return (EINVAL);
11231         } else {
11232                 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11233                     (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11234                     dp != NULL && dp->dtdo_destructive)) {
11235                         state->dts_destructive = 1;
11236                 }
11237
11238                 switch (desc->dtad_kind) {
11239                 case DTRACEACT_PRINTF:
11240                 case DTRACEACT_PRINTA:
11241                 case DTRACEACT_SYSTEM:
11242                 case DTRACEACT_FREOPEN:
11243                 case DTRACEACT_DIFEXPR:
11244                         /*
11245                          * We know that our arg is a string -- turn it into a
11246                          * format.
11247                          */
11248                         if (arg == 0) {
11249                                 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11250                                     desc->dtad_kind == DTRACEACT_DIFEXPR);
11251                                 format = 0;
11252                         } else {
11253                                 ASSERT(arg != 0);
11254 #if defined(sun)
11255                                 ASSERT(arg > KERNELBASE);
11256 #endif
11257                                 format = dtrace_format_add(state,
11258                                     (char *)(uintptr_t)arg);
11259                         }
11260
11261                         /*FALLTHROUGH*/
11262                 case DTRACEACT_LIBACT:
11263                 case DTRACEACT_TRACEMEM:
11264                 case DTRACEACT_TRACEMEM_DYNSIZE:
11265                         if (dp == NULL)
11266                                 return (EINVAL);
11267
11268                         if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11269                                 break;
11270
11271                         if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11272                                 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11273                                         return (EINVAL);
11274
11275                                 size = opt[DTRACEOPT_STRSIZE];
11276                         }
11277
11278                         break;
11279
11280                 case DTRACEACT_STACK:
11281                         if ((nframes = arg) == 0) {
11282                                 nframes = opt[DTRACEOPT_STACKFRAMES];
11283                                 ASSERT(nframes > 0);
11284                                 arg = nframes;
11285                         }
11286
11287                         size = nframes * sizeof (pc_t);
11288                         break;
11289
11290                 case DTRACEACT_JSTACK:
11291                         if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11292                                 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11293
11294                         if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11295                                 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11296
11297                         arg = DTRACE_USTACK_ARG(nframes, strsize);
11298
11299                         /*FALLTHROUGH*/
11300                 case DTRACEACT_USTACK:
11301                         if (desc->dtad_kind != DTRACEACT_JSTACK &&
11302                             (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11303                                 strsize = DTRACE_USTACK_STRSIZE(arg);
11304                                 nframes = opt[DTRACEOPT_USTACKFRAMES];
11305                                 ASSERT(nframes > 0);
11306                                 arg = DTRACE_USTACK_ARG(nframes, strsize);
11307                         }
11308
11309                         /*
11310                          * Save a slot for the pid.
11311                          */
11312                         size = (nframes + 1) * sizeof (uint64_t);
11313                         size += DTRACE_USTACK_STRSIZE(arg);
11314                         size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11315
11316                         break;
11317
11318                 case DTRACEACT_SYM:
11319                 case DTRACEACT_MOD:
11320                         if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11321                             sizeof (uint64_t)) ||
11322                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11323                                 return (EINVAL);
11324                         break;
11325
11326                 case DTRACEACT_USYM:
11327                 case DTRACEACT_UMOD:
11328                 case DTRACEACT_UADDR:
11329                         if (dp == NULL ||
11330                             (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11331                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11332                                 return (EINVAL);
11333
11334                         /*
11335                          * We have a slot for the pid, plus a slot for the
11336                          * argument.  To keep things simple (aligned with
11337                          * bitness-neutral sizing), we store each as a 64-bit
11338                          * quantity.
11339                          */
11340                         size = 2 * sizeof (uint64_t);
11341                         break;
11342
11343                 case DTRACEACT_STOP:
11344                 case DTRACEACT_BREAKPOINT:
11345                 case DTRACEACT_PANIC:
11346                         break;
11347
11348                 case DTRACEACT_CHILL:
11349                 case DTRACEACT_DISCARD:
11350                 case DTRACEACT_RAISE:
11351                         if (dp == NULL)
11352                                 return (EINVAL);
11353                         break;
11354
11355                 case DTRACEACT_EXIT:
11356                         if (dp == NULL ||
11357                             (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11358                             (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11359                                 return (EINVAL);
11360                         break;
11361
11362                 case DTRACEACT_SPECULATE:
11363                         if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11364                                 return (EINVAL);
11365
11366                         if (dp == NULL)
11367                                 return (EINVAL);
11368
11369                         state->dts_speculates = 1;
11370                         break;
11371
11372                 case DTRACEACT_PRINTM:
11373                         size = dp->dtdo_rtype.dtdt_size;
11374                         break;
11375
11376                 case DTRACEACT_PRINTT:
11377                         size = dp->dtdo_rtype.dtdt_size;
11378                         break;
11379
11380                 case DTRACEACT_COMMIT: {
11381                         dtrace_action_t *act = ecb->dte_action;
11382
11383                         for (; act != NULL; act = act->dta_next) {
11384                                 if (act->dta_kind == DTRACEACT_COMMIT)
11385                                         return (EINVAL);
11386                         }
11387
11388                         if (dp == NULL)
11389                                 return (EINVAL);
11390                         break;
11391                 }
11392
11393                 default:
11394                         return (EINVAL);
11395                 }
11396
11397                 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11398                         /*
11399                          * If this is a data-storing action or a speculate,
11400                          * we must be sure that there isn't a commit on the
11401                          * action chain.
11402                          */
11403                         dtrace_action_t *act = ecb->dte_action;
11404
11405                         for (; act != NULL; act = act->dta_next) {
11406                                 if (act->dta_kind == DTRACEACT_COMMIT)
11407                                         return (EINVAL);
11408                         }
11409                 }
11410
11411                 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11412                 action->dta_rec.dtrd_size = size;
11413         }
11414
11415         action->dta_refcnt = 1;
11416         rec = &action->dta_rec;
11417         size = rec->dtrd_size;
11418
11419         for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11420                 if (!(size & mask)) {
11421                         align = mask + 1;
11422                         break;
11423                 }
11424         }
11425
11426         action->dta_kind = desc->dtad_kind;
11427
11428         if ((action->dta_difo = dp) != NULL)
11429                 dtrace_difo_hold(dp);
11430
11431         rec->dtrd_action = action->dta_kind;
11432         rec->dtrd_arg = arg;
11433         rec->dtrd_uarg = desc->dtad_uarg;
11434         rec->dtrd_alignment = (uint16_t)align;
11435         rec->dtrd_format = format;
11436
11437         if ((last = ecb->dte_action_last) != NULL) {
11438                 ASSERT(ecb->dte_action != NULL);
11439                 action->dta_prev = last;
11440                 last->dta_next = action;
11441         } else {
11442                 ASSERT(ecb->dte_action == NULL);
11443                 ecb->dte_action = action;
11444         }
11445
11446         ecb->dte_action_last = action;
11447
11448         return (0);
11449 }
11450
11451 static void
11452 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11453 {
11454         dtrace_action_t *act = ecb->dte_action, *next;
11455         dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11456         dtrace_difo_t *dp;
11457         uint16_t format;
11458
11459         if (act != NULL && act->dta_refcnt > 1) {
11460                 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11461                 act->dta_refcnt--;
11462         } else {
11463                 for (; act != NULL; act = next) {
11464                         next = act->dta_next;
11465                         ASSERT(next != NULL || act == ecb->dte_action_last);
11466                         ASSERT(act->dta_refcnt == 1);
11467
11468                         if ((format = act->dta_rec.dtrd_format) != 0)
11469                                 dtrace_format_remove(ecb->dte_state, format);
11470
11471                         if ((dp = act->dta_difo) != NULL)
11472                                 dtrace_difo_release(dp, vstate);
11473
11474                         if (DTRACEACT_ISAGG(act->dta_kind)) {
11475                                 dtrace_ecb_aggregation_destroy(ecb, act);
11476                         } else {
11477                                 kmem_free(act, sizeof (dtrace_action_t));
11478                         }
11479                 }
11480         }
11481
11482         ecb->dte_action = NULL;
11483         ecb->dte_action_last = NULL;
11484         ecb->dte_size = 0;
11485 }
11486
11487 static void
11488 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11489 {
11490         /*
11491          * We disable the ECB by removing it from its probe.
11492          */
11493         dtrace_ecb_t *pecb, *prev = NULL;
11494         dtrace_probe_t *probe = ecb->dte_probe;
11495
11496         ASSERT(MUTEX_HELD(&dtrace_lock));
11497
11498         if (probe == NULL) {
11499                 /*
11500                  * This is the NULL probe; there is nothing to disable.
11501                  */
11502                 return;
11503         }
11504
11505         for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11506                 if (pecb == ecb)
11507                         break;
11508                 prev = pecb;
11509         }
11510
11511         ASSERT(pecb != NULL);
11512
11513         if (prev == NULL) {
11514                 probe->dtpr_ecb = ecb->dte_next;
11515         } else {
11516                 prev->dte_next = ecb->dte_next;
11517         }
11518
11519         if (ecb == probe->dtpr_ecb_last) {
11520                 ASSERT(ecb->dte_next == NULL);
11521                 probe->dtpr_ecb_last = prev;
11522         }
11523
11524         /*
11525          * The ECB has been disconnected from the probe; now sync to assure
11526          * that all CPUs have seen the change before returning.
11527          */
11528         dtrace_sync();
11529
11530         if (probe->dtpr_ecb == NULL) {
11531                 /*
11532                  * That was the last ECB on the probe; clear the predicate
11533                  * cache ID for the probe, disable it and sync one more time
11534                  * to assure that we'll never hit it again.
11535                  */
11536                 dtrace_provider_t *prov = probe->dtpr_provider;
11537
11538                 ASSERT(ecb->dte_next == NULL);
11539                 ASSERT(probe->dtpr_ecb_last == NULL);
11540                 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11541                 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11542                     probe->dtpr_id, probe->dtpr_arg);
11543                 dtrace_sync();
11544         } else {
11545                 /*
11546                  * There is at least one ECB remaining on the probe.  If there
11547                  * is _exactly_ one, set the probe's predicate cache ID to be
11548                  * the predicate cache ID of the remaining ECB.
11549                  */
11550                 ASSERT(probe->dtpr_ecb_last != NULL);
11551                 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11552
11553                 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11554                         dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11555
11556                         ASSERT(probe->dtpr_ecb->dte_next == NULL);
11557
11558                         if (p != NULL)
11559                                 probe->dtpr_predcache = p->dtp_cacheid;
11560                 }
11561
11562                 ecb->dte_next = NULL;
11563         }
11564 }
11565
11566 static void
11567 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11568 {
11569         dtrace_state_t *state = ecb->dte_state;
11570         dtrace_vstate_t *vstate = &state->dts_vstate;
11571         dtrace_predicate_t *pred;
11572         dtrace_epid_t epid = ecb->dte_epid;
11573
11574         ASSERT(MUTEX_HELD(&dtrace_lock));
11575         ASSERT(ecb->dte_next == NULL);
11576         ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11577
11578         if ((pred = ecb->dte_predicate) != NULL)
11579                 dtrace_predicate_release(pred, vstate);
11580
11581         dtrace_ecb_action_remove(ecb);
11582
11583         ASSERT(state->dts_ecbs[epid - 1] == ecb);
11584         state->dts_ecbs[epid - 1] = NULL;
11585
11586         kmem_free(ecb, sizeof (dtrace_ecb_t));
11587 }
11588
11589 static dtrace_ecb_t *
11590 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11591     dtrace_enabling_t *enab)
11592 {
11593         dtrace_ecb_t *ecb;
11594         dtrace_predicate_t *pred;
11595         dtrace_actdesc_t *act;
11596         dtrace_provider_t *prov;
11597         dtrace_ecbdesc_t *desc = enab->dten_current;
11598
11599         ASSERT(MUTEX_HELD(&dtrace_lock));
11600         ASSERT(state != NULL);
11601
11602         ecb = dtrace_ecb_add(state, probe);
11603         ecb->dte_uarg = desc->dted_uarg;
11604
11605         if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11606                 dtrace_predicate_hold(pred);
11607                 ecb->dte_predicate = pred;
11608         }
11609
11610         if (probe != NULL) {
11611                 /*
11612                  * If the provider shows more leg than the consumer is old
11613                  * enough to see, we need to enable the appropriate implicit
11614                  * predicate bits to prevent the ecb from activating at
11615                  * revealing times.
11616                  *
11617                  * Providers specifying DTRACE_PRIV_USER at register time
11618                  * are stating that they need the /proc-style privilege
11619                  * model to be enforced, and this is what DTRACE_COND_OWNER
11620                  * and DTRACE_COND_ZONEOWNER will then do at probe time.
11621                  */
11622                 prov = probe->dtpr_provider;
11623                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11624                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11625                         ecb->dte_cond |= DTRACE_COND_OWNER;
11626
11627                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11628                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11629                         ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11630
11631                 /*
11632                  * If the provider shows us kernel innards and the user
11633                  * is lacking sufficient privilege, enable the
11634                  * DTRACE_COND_USERMODE implicit predicate.
11635                  */
11636                 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11637                     (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11638                         ecb->dte_cond |= DTRACE_COND_USERMODE;
11639         }
11640
11641         if (dtrace_ecb_create_cache != NULL) {
11642                 /*
11643                  * If we have a cached ecb, we'll use its action list instead
11644                  * of creating our own (saving both time and space).
11645                  */
11646                 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11647                 dtrace_action_t *act = cached->dte_action;
11648
11649                 if (act != NULL) {
11650                         ASSERT(act->dta_refcnt > 0);
11651                         act->dta_refcnt++;
11652                         ecb->dte_action = act;
11653                         ecb->dte_action_last = cached->dte_action_last;
11654                         ecb->dte_needed = cached->dte_needed;
11655                         ecb->dte_size = cached->dte_size;
11656                         ecb->dte_alignment = cached->dte_alignment;
11657                 }
11658
11659                 return (ecb);
11660         }
11661
11662         for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11663                 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11664                         dtrace_ecb_destroy(ecb);
11665                         return (NULL);
11666                 }
11667         }
11668
11669         dtrace_ecb_resize(ecb);
11670
11671         return (dtrace_ecb_create_cache = ecb);
11672 }
11673
11674 static int
11675 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg)
11676 {
11677         dtrace_ecb_t *ecb;
11678         dtrace_enabling_t *enab = arg;
11679         dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11680
11681         ASSERT(state != NULL);
11682
11683         if (probe != NULL && probe->dtpr_gen < enab->dten_probegen) {
11684                 /*
11685                  * This probe was created in a generation for which this
11686                  * enabling has previously created ECBs; we don't want to
11687                  * enable it again, so just kick out.
11688                  */
11689                 return (DTRACE_MATCH_NEXT);
11690         }
11691
11692         if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11693                 return (DTRACE_MATCH_DONE);
11694
11695         dtrace_ecb_enable(ecb);
11696         return (DTRACE_MATCH_NEXT);
11697 }
11698
11699 static dtrace_ecb_t *
11700 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11701 {
11702         dtrace_ecb_t *ecb;
11703
11704         ASSERT(MUTEX_HELD(&dtrace_lock));
11705
11706         if (id == 0 || id > state->dts_necbs)
11707                 return (NULL);
11708
11709         ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11710         ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11711
11712         return (state->dts_ecbs[id - 1]);
11713 }
11714
11715 static dtrace_aggregation_t *
11716 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11717 {
11718         dtrace_aggregation_t *agg;
11719
11720         ASSERT(MUTEX_HELD(&dtrace_lock));
11721
11722         if (id == 0 || id > state->dts_naggregations)
11723                 return (NULL);
11724
11725         ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11726         ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11727             agg->dtag_id == id);
11728
11729         return (state->dts_aggregations[id - 1]);
11730 }
11731
11732 /*
11733  * DTrace Buffer Functions
11734  *
11735  * The following functions manipulate DTrace buffers.  Most of these functions
11736  * are called in the context of establishing or processing consumer state;
11737  * exceptions are explicitly noted.
11738  */
11739
11740 /*
11741  * Note:  called from cross call context.  This function switches the two
11742  * buffers on a given CPU.  The atomicity of this operation is assured by
11743  * disabling interrupts while the actual switch takes place; the disabling of
11744  * interrupts serializes the execution with any execution of dtrace_probe() on
11745  * the same CPU.
11746  */
11747 static void
11748 dtrace_buffer_switch(dtrace_buffer_t *buf)
11749 {
11750         caddr_t tomax = buf->dtb_tomax;
11751         caddr_t xamot = buf->dtb_xamot;
11752         dtrace_icookie_t cookie;
11753         hrtime_t now;
11754
11755         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11756         ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11757
11758         cookie = dtrace_interrupt_disable();
11759         now = dtrace_gethrtime();
11760         buf->dtb_tomax = xamot;
11761         buf->dtb_xamot = tomax;
11762         buf->dtb_xamot_drops = buf->dtb_drops;
11763         buf->dtb_xamot_offset = buf->dtb_offset;
11764         buf->dtb_xamot_errors = buf->dtb_errors;
11765         buf->dtb_xamot_flags = buf->dtb_flags;
11766         buf->dtb_offset = 0;
11767         buf->dtb_drops = 0;
11768         buf->dtb_errors = 0;
11769         buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11770         buf->dtb_interval = now - buf->dtb_switched;
11771         buf->dtb_switched = now;
11772         dtrace_interrupt_enable(cookie);
11773 }
11774
11775 /*
11776  * Note:  called from cross call context.  This function activates a buffer
11777  * on a CPU.  As with dtrace_buffer_switch(), the atomicity of the operation
11778  * is guaranteed by the disabling of interrupts.
11779  */
11780 static void
11781 dtrace_buffer_activate(dtrace_state_t *state)
11782 {
11783         dtrace_buffer_t *buf;
11784         dtrace_icookie_t cookie = dtrace_interrupt_disable();
11785
11786         buf = &state->dts_buffer[curcpu];
11787
11788         if (buf->dtb_tomax != NULL) {
11789                 /*
11790                  * We might like to assert that the buffer is marked inactive,
11791                  * but this isn't necessarily true:  the buffer for the CPU
11792                  * that processes the BEGIN probe has its buffer activated
11793                  * manually.  In this case, we take the (harmless) action
11794                  * re-clearing the bit INACTIVE bit.
11795                  */
11796                 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11797         }
11798
11799         dtrace_interrupt_enable(cookie);
11800 }
11801
11802 static int
11803 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
11804     processorid_t cpu, int *factor)
11805 {
11806 #if defined(sun)
11807         cpu_t *cp;
11808 #endif
11809         dtrace_buffer_t *buf;
11810         int allocated = 0, desired = 0;
11811
11812 #if defined(sun)
11813         ASSERT(MUTEX_HELD(&cpu_lock));
11814         ASSERT(MUTEX_HELD(&dtrace_lock));
11815
11816         *factor = 1;
11817
11818         if (size > dtrace_nonroot_maxsize &&
11819             !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
11820                 return (EFBIG);
11821
11822         cp = cpu_list;
11823
11824         do {
11825                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11826                         continue;
11827
11828                 buf = &bufs[cp->cpu_id];
11829
11830                 /*
11831                  * If there is already a buffer allocated for this CPU, it
11832                  * is only possible that this is a DR event.  In this case,
11833                  */
11834                 if (buf->dtb_tomax != NULL) {
11835                         ASSERT(buf->dtb_size == size);
11836                         continue;
11837                 }
11838
11839                 ASSERT(buf->dtb_xamot == NULL);
11840
11841                 if ((buf->dtb_tomax = kmem_zalloc(size,
11842                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11843                         goto err;
11844
11845                 buf->dtb_size = size;
11846                 buf->dtb_flags = flags;
11847                 buf->dtb_offset = 0;
11848                 buf->dtb_drops = 0;
11849
11850                 if (flags & DTRACEBUF_NOSWITCH)
11851                         continue;
11852
11853                 if ((buf->dtb_xamot = kmem_zalloc(size,
11854                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11855                         goto err;
11856         } while ((cp = cp->cpu_next) != cpu_list);
11857
11858         return (0);
11859
11860 err:
11861         cp = cpu_list;
11862
11863         do {
11864                 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
11865                         continue;
11866
11867                 buf = &bufs[cp->cpu_id];
11868                 desired += 2;
11869
11870                 if (buf->dtb_xamot != NULL) {
11871                         ASSERT(buf->dtb_tomax != NULL);
11872                         ASSERT(buf->dtb_size == size);
11873                         kmem_free(buf->dtb_xamot, size);
11874                         allocated++;
11875                 }
11876
11877                 if (buf->dtb_tomax != NULL) {
11878                         ASSERT(buf->dtb_size == size);
11879                         kmem_free(buf->dtb_tomax, size);
11880                         allocated++;
11881                 }
11882
11883                 buf->dtb_tomax = NULL;
11884                 buf->dtb_xamot = NULL;
11885                 buf->dtb_size = 0;
11886         } while ((cp = cp->cpu_next) != cpu_list);
11887 #else
11888         int i;
11889
11890         *factor = 1;
11891 #if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
11892         /*
11893          * FreeBSD isn't good at limiting the amount of memory we
11894          * ask to malloc, so let's place a limit here before trying
11895          * to do something that might well end in tears at bedtime.
11896          */
11897         if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
11898                 return (ENOMEM);
11899 #endif
11900
11901         ASSERT(MUTEX_HELD(&dtrace_lock));
11902         CPU_FOREACH(i) {
11903                 if (cpu != DTRACE_CPUALL && cpu != i)
11904                         continue;
11905
11906                 buf = &bufs[i];
11907
11908                 /*
11909                  * If there is already a buffer allocated for this CPU, it
11910                  * is only possible that this is a DR event.  In this case,
11911                  * the buffer size must match our specified size.
11912                  */
11913                 if (buf->dtb_tomax != NULL) {
11914                         ASSERT(buf->dtb_size == size);
11915                         continue;
11916                 }
11917
11918                 ASSERT(buf->dtb_xamot == NULL);
11919
11920                 if ((buf->dtb_tomax = kmem_zalloc(size,
11921                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11922                         goto err;
11923
11924                 buf->dtb_size = size;
11925                 buf->dtb_flags = flags;
11926                 buf->dtb_offset = 0;
11927                 buf->dtb_drops = 0;
11928
11929                 if (flags & DTRACEBUF_NOSWITCH)
11930                         continue;
11931
11932                 if ((buf->dtb_xamot = kmem_zalloc(size,
11933                     KM_NOSLEEP | KM_NORMALPRI)) == NULL)
11934                         goto err;
11935         }
11936
11937         return (0);
11938
11939 err:
11940         /*
11941          * Error allocating memory, so free the buffers that were
11942          * allocated before the failed allocation.
11943          */
11944         CPU_FOREACH(i) {
11945                 if (cpu != DTRACE_CPUALL && cpu != i)
11946                         continue;
11947
11948                 buf = &bufs[i];
11949                 desired += 2;
11950
11951                 if (buf->dtb_xamot != NULL) {
11952                         ASSERT(buf->dtb_tomax != NULL);
11953                         ASSERT(buf->dtb_size == size);
11954                         kmem_free(buf->dtb_xamot, size);
11955                         allocated++;
11956                 }
11957
11958                 if (buf->dtb_tomax != NULL) {
11959                         ASSERT(buf->dtb_size == size);
11960                         kmem_free(buf->dtb_tomax, size);
11961                         allocated++;
11962                 }
11963
11964                 buf->dtb_tomax = NULL;
11965                 buf->dtb_xamot = NULL;
11966                 buf->dtb_size = 0;
11967
11968         }
11969 #endif
11970         *factor = desired / (allocated > 0 ? allocated : 1);
11971
11972         return (ENOMEM);
11973 }
11974
11975 /*
11976  * Note:  called from probe context.  This function just increments the drop
11977  * count on a buffer.  It has been made a function to allow for the
11978  * possibility of understanding the source of mysterious drop counts.  (A
11979  * problem for which one may be particularly disappointed that DTrace cannot
11980  * be used to understand DTrace.)
11981  */
11982 static void
11983 dtrace_buffer_drop(dtrace_buffer_t *buf)
11984 {
11985         buf->dtb_drops++;
11986 }
11987
11988 /*
11989  * Note:  called from probe context.  This function is called to reserve space
11990  * in a buffer.  If mstate is non-NULL, sets the scratch base and size in the
11991  * mstate.  Returns the new offset in the buffer, or a negative value if an
11992  * error has occurred.
11993  */
11994 static intptr_t
11995 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
11996     dtrace_state_t *state, dtrace_mstate_t *mstate)
11997 {
11998         intptr_t offs = buf->dtb_offset, soffs;
11999         intptr_t woffs;
12000         caddr_t tomax;
12001         size_t total;
12002
12003         if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12004                 return (-1);
12005
12006         if ((tomax = buf->dtb_tomax) == NULL) {
12007                 dtrace_buffer_drop(buf);
12008                 return (-1);
12009         }
12010
12011         if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12012                 while (offs & (align - 1)) {
12013                         /*
12014                          * Assert that our alignment is off by a number which
12015                          * is itself sizeof (uint32_t) aligned.
12016                          */
12017                         ASSERT(!((align - (offs & (align - 1))) &
12018                             (sizeof (uint32_t) - 1)));
12019                         DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12020                         offs += sizeof (uint32_t);
12021                 }
12022
12023                 if ((soffs = offs + needed) > buf->dtb_size) {
12024                         dtrace_buffer_drop(buf);
12025                         return (-1);
12026                 }
12027
12028                 if (mstate == NULL)
12029                         return (offs);
12030
12031                 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12032                 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12033                 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12034
12035                 return (offs);
12036         }
12037
12038         if (buf->dtb_flags & DTRACEBUF_FILL) {
12039                 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12040                     (buf->dtb_flags & DTRACEBUF_FULL))
12041                         return (-1);
12042                 goto out;
12043         }
12044
12045         total = needed + (offs & (align - 1));
12046
12047         /*
12048          * For a ring buffer, life is quite a bit more complicated.  Before
12049          * we can store any padding, we need to adjust our wrapping offset.
12050          * (If we've never before wrapped or we're not about to, no adjustment
12051          * is required.)
12052          */
12053         if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12054             offs + total > buf->dtb_size) {
12055                 woffs = buf->dtb_xamot_offset;
12056
12057                 if (offs + total > buf->dtb_size) {
12058                         /*
12059                          * We can't fit in the end of the buffer.  First, a
12060                          * sanity check that we can fit in the buffer at all.
12061                          */
12062                         if (total > buf->dtb_size) {
12063                                 dtrace_buffer_drop(buf);
12064                                 return (-1);
12065                         }
12066
12067                         /*
12068                          * We're going to be storing at the top of the buffer,
12069                          * so now we need to deal with the wrapped offset.  We
12070                          * only reset our wrapped offset to 0 if it is
12071                          * currently greater than the current offset.  If it
12072                          * is less than the current offset, it is because a
12073                          * previous allocation induced a wrap -- but the
12074                          * allocation didn't subsequently take the space due
12075                          * to an error or false predicate evaluation.  In this
12076                          * case, we'll just leave the wrapped offset alone: if
12077                          * the wrapped offset hasn't been advanced far enough
12078                          * for this allocation, it will be adjusted in the
12079                          * lower loop.
12080                          */
12081                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12082                                 if (woffs >= offs)
12083                                         woffs = 0;
12084                         } else {
12085                                 woffs = 0;
12086                         }
12087
12088                         /*
12089                          * Now we know that we're going to be storing to the
12090                          * top of the buffer and that there is room for us
12091                          * there.  We need to clear the buffer from the current
12092                          * offset to the end (there may be old gunk there).
12093                          */
12094                         while (offs < buf->dtb_size)
12095                                 tomax[offs++] = 0;
12096
12097                         /*
12098                          * We need to set our offset to zero.  And because we
12099                          * are wrapping, we need to set the bit indicating as
12100                          * much.  We can also adjust our needed space back
12101                          * down to the space required by the ECB -- we know
12102                          * that the top of the buffer is aligned.
12103                          */
12104                         offs = 0;
12105                         total = needed;
12106                         buf->dtb_flags |= DTRACEBUF_WRAPPED;
12107                 } else {
12108                         /*
12109                          * There is room for us in the buffer, so we simply
12110                          * need to check the wrapped offset.
12111                          */
12112                         if (woffs < offs) {
12113                                 /*
12114                                  * The wrapped offset is less than the offset.
12115                                  * This can happen if we allocated buffer space
12116                                  * that induced a wrap, but then we didn't
12117                                  * subsequently take the space due to an error
12118                                  * or false predicate evaluation.  This is
12119                                  * okay; we know that _this_ allocation isn't
12120                                  * going to induce a wrap.  We still can't
12121                                  * reset the wrapped offset to be zero,
12122                                  * however: the space may have been trashed in
12123                                  * the previous failed probe attempt.  But at
12124                                  * least the wrapped offset doesn't need to
12125                                  * be adjusted at all...
12126                                  */
12127                                 goto out;
12128                         }
12129                 }
12130
12131                 while (offs + total > woffs) {
12132                         dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12133                         size_t size;
12134
12135                         if (epid == DTRACE_EPIDNONE) {
12136                                 size = sizeof (uint32_t);
12137                         } else {
12138                                 ASSERT3U(epid, <=, state->dts_necbs);
12139                                 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12140
12141                                 size = state->dts_ecbs[epid - 1]->dte_size;
12142                         }
12143
12144                         ASSERT(woffs + size <= buf->dtb_size);
12145                         ASSERT(size != 0);
12146
12147                         if (woffs + size == buf->dtb_size) {
12148                                 /*
12149                                  * We've reached the end of the buffer; we want
12150                                  * to set the wrapped offset to 0 and break
12151                                  * out.  However, if the offs is 0, then we're
12152                                  * in a strange edge-condition:  the amount of
12153                                  * space that we want to reserve plus the size
12154                                  * of the record that we're overwriting is
12155                                  * greater than the size of the buffer.  This
12156                                  * is problematic because if we reserve the
12157                                  * space but subsequently don't consume it (due
12158                                  * to a failed predicate or error) the wrapped
12159                                  * offset will be 0 -- yet the EPID at offset 0
12160                                  * will not be committed.  This situation is
12161                                  * relatively easy to deal with:  if we're in
12162                                  * this case, the buffer is indistinguishable
12163                                  * from one that hasn't wrapped; we need only
12164                                  * finish the job by clearing the wrapped bit,
12165                                  * explicitly setting the offset to be 0, and
12166                                  * zero'ing out the old data in the buffer.
12167                                  */
12168                                 if (offs == 0) {
12169                                         buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12170                                         buf->dtb_offset = 0;
12171                                         woffs = total;
12172
12173                                         while (woffs < buf->dtb_size)
12174                                                 tomax[woffs++] = 0;
12175                                 }
12176
12177                                 woffs = 0;
12178                                 break;
12179                         }
12180
12181                         woffs += size;
12182                 }
12183
12184                 /*
12185                  * We have a wrapped offset.  It may be that the wrapped offset
12186                  * has become zero -- that's okay.
12187                  */
12188                 buf->dtb_xamot_offset = woffs;
12189         }
12190
12191 out:
12192         /*
12193          * Now we can plow the buffer with any necessary padding.
12194          */
12195         while (offs & (align - 1)) {
12196                 /*
12197                  * Assert that our alignment is off by a number which
12198                  * is itself sizeof (uint32_t) aligned.
12199                  */
12200                 ASSERT(!((align - (offs & (align - 1))) &
12201                     (sizeof (uint32_t) - 1)));
12202                 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12203                 offs += sizeof (uint32_t);
12204         }
12205
12206         if (buf->dtb_flags & DTRACEBUF_FILL) {
12207                 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12208                         buf->dtb_flags |= DTRACEBUF_FULL;
12209                         return (-1);
12210                 }
12211         }
12212
12213         if (mstate == NULL)
12214                 return (offs);
12215
12216         /*
12217          * For ring buffers and fill buffers, the scratch space is always
12218          * the inactive buffer.
12219          */
12220         mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12221         mstate->dtms_scratch_size = buf->dtb_size;
12222         mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12223
12224         return (offs);
12225 }
12226
12227 static void
12228 dtrace_buffer_polish(dtrace_buffer_t *buf)
12229 {
12230         ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12231         ASSERT(MUTEX_HELD(&dtrace_lock));
12232
12233         if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12234                 return;
12235
12236         /*
12237          * We need to polish the ring buffer.  There are three cases:
12238          *
12239          * - The first (and presumably most common) is that there is no gap
12240          *   between the buffer offset and the wrapped offset.  In this case,
12241          *   there is nothing in the buffer that isn't valid data; we can
12242          *   mark the buffer as polished and return.
12243          *
12244          * - The second (less common than the first but still more common
12245          *   than the third) is that there is a gap between the buffer offset
12246          *   and the wrapped offset, and the wrapped offset is larger than the
12247          *   buffer offset.  This can happen because of an alignment issue, or
12248          *   can happen because of a call to dtrace_buffer_reserve() that
12249          *   didn't subsequently consume the buffer space.  In this case,
12250          *   we need to zero the data from the buffer offset to the wrapped
12251          *   offset.
12252          *
12253          * - The third (and least common) is that there is a gap between the
12254          *   buffer offset and the wrapped offset, but the wrapped offset is
12255          *   _less_ than the buffer offset.  This can only happen because a
12256          *   call to dtrace_buffer_reserve() induced a wrap, but the space
12257          *   was not subsequently consumed.  In this case, we need to zero the
12258          *   space from the offset to the end of the buffer _and_ from the
12259          *   top of the buffer to the wrapped offset.
12260          */
12261         if (buf->dtb_offset < buf->dtb_xamot_offset) {
12262                 bzero(buf->dtb_tomax + buf->dtb_offset,
12263                     buf->dtb_xamot_offset - buf->dtb_offset);
12264         }
12265
12266         if (buf->dtb_offset > buf->dtb_xamot_offset) {
12267                 bzero(buf->dtb_tomax + buf->dtb_offset,
12268                     buf->dtb_size - buf->dtb_offset);
12269                 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12270         }
12271 }
12272
12273 /*
12274  * This routine determines if data generated at the specified time has likely
12275  * been entirely consumed at user-level.  This routine is called to determine
12276  * if an ECB on a defunct probe (but for an active enabling) can be safely
12277  * disabled and destroyed.
12278  */
12279 static int
12280 dtrace_buffer_consumed(dtrace_buffer_t *bufs, hrtime_t when)
12281 {
12282         int i;
12283
12284         for (i = 0; i < NCPU; i++) {
12285                 dtrace_buffer_t *buf = &bufs[i];
12286
12287                 if (buf->dtb_size == 0)
12288                         continue;
12289
12290                 if (buf->dtb_flags & DTRACEBUF_RING)
12291                         return (0);
12292
12293                 if (!buf->dtb_switched && buf->dtb_offset != 0)
12294                         return (0);
12295
12296                 if (buf->dtb_switched - buf->dtb_interval < when)
12297                         return (0);
12298         }
12299
12300         return (1);
12301 }
12302
12303 static void
12304 dtrace_buffer_free(dtrace_buffer_t *bufs)
12305 {
12306         int i;
12307
12308         for (i = 0; i < NCPU; i++) {
12309                 dtrace_buffer_t *buf = &bufs[i];
12310
12311                 if (buf->dtb_tomax == NULL) {
12312                         ASSERT(buf->dtb_xamot == NULL);
12313                         ASSERT(buf->dtb_size == 0);
12314                         continue;
12315                 }
12316
12317                 if (buf->dtb_xamot != NULL) {
12318                         ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12319                         kmem_free(buf->dtb_xamot, buf->dtb_size);
12320                 }
12321
12322                 kmem_free(buf->dtb_tomax, buf->dtb_size);
12323                 buf->dtb_size = 0;
12324                 buf->dtb_tomax = NULL;
12325                 buf->dtb_xamot = NULL;
12326         }
12327 }
12328
12329 /*
12330  * DTrace Enabling Functions
12331  */
12332 static dtrace_enabling_t *
12333 dtrace_enabling_create(dtrace_vstate_t *vstate)
12334 {
12335         dtrace_enabling_t *enab;
12336
12337         enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12338         enab->dten_vstate = vstate;
12339
12340         return (enab);
12341 }
12342
12343 static void
12344 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12345 {
12346         dtrace_ecbdesc_t **ndesc;
12347         size_t osize, nsize;
12348
12349         /*
12350          * We can't add to enablings after we've enabled them, or after we've
12351          * retained them.
12352          */
12353         ASSERT(enab->dten_probegen == 0);
12354         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12355
12356         if (enab->dten_ndesc < enab->dten_maxdesc) {
12357                 enab->dten_desc[enab->dten_ndesc++] = ecb;
12358                 return;
12359         }
12360
12361         osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12362
12363         if (enab->dten_maxdesc == 0) {
12364                 enab->dten_maxdesc = 1;
12365         } else {
12366                 enab->dten_maxdesc <<= 1;
12367         }
12368
12369         ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12370
12371         nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12372         ndesc = kmem_zalloc(nsize, KM_SLEEP);
12373         bcopy(enab->dten_desc, ndesc, osize);
12374         if (enab->dten_desc != NULL)
12375                 kmem_free(enab->dten_desc, osize);
12376
12377         enab->dten_desc = ndesc;
12378         enab->dten_desc[enab->dten_ndesc++] = ecb;
12379 }
12380
12381 static void
12382 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12383     dtrace_probedesc_t *pd)
12384 {
12385         dtrace_ecbdesc_t *new;
12386         dtrace_predicate_t *pred;
12387         dtrace_actdesc_t *act;
12388
12389         /*
12390          * We're going to create a new ECB description that matches the
12391          * specified ECB in every way, but has the specified probe description.
12392          */
12393         new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12394
12395         if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12396                 dtrace_predicate_hold(pred);
12397
12398         for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12399                 dtrace_actdesc_hold(act);
12400
12401         new->dted_action = ecb->dted_action;
12402         new->dted_pred = ecb->dted_pred;
12403         new->dted_probe = *pd;
12404         new->dted_uarg = ecb->dted_uarg;
12405
12406         dtrace_enabling_add(enab, new);
12407 }
12408
12409 static void
12410 dtrace_enabling_dump(dtrace_enabling_t *enab)
12411 {
12412         int i;
12413
12414         for (i = 0; i < enab->dten_ndesc; i++) {
12415                 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12416
12417                 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12418                     desc->dtpd_provider, desc->dtpd_mod,
12419                     desc->dtpd_func, desc->dtpd_name);
12420         }
12421 }
12422
12423 static void
12424 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12425 {
12426         int i;
12427         dtrace_ecbdesc_t *ep;
12428         dtrace_vstate_t *vstate = enab->dten_vstate;
12429
12430         ASSERT(MUTEX_HELD(&dtrace_lock));
12431
12432         for (i = 0; i < enab->dten_ndesc; i++) {
12433                 dtrace_actdesc_t *act, *next;
12434                 dtrace_predicate_t *pred;
12435
12436                 ep = enab->dten_desc[i];
12437
12438                 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12439                         dtrace_predicate_release(pred, vstate);
12440
12441                 for (act = ep->dted_action; act != NULL; act = next) {
12442                         next = act->dtad_next;
12443                         dtrace_actdesc_release(act, vstate);
12444                 }
12445
12446                 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12447         }
12448
12449         if (enab->dten_desc != NULL)
12450                 kmem_free(enab->dten_desc,
12451                     enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12452
12453         /*
12454          * If this was a retained enabling, decrement the dts_nretained count
12455          * and take it off of the dtrace_retained list.
12456          */
12457         if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12458             dtrace_retained == enab) {
12459                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12460                 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12461                 enab->dten_vstate->dtvs_state->dts_nretained--;
12462                 dtrace_retained_gen++;
12463         }
12464
12465         if (enab->dten_prev == NULL) {
12466                 if (dtrace_retained == enab) {
12467                         dtrace_retained = enab->dten_next;
12468
12469                         if (dtrace_retained != NULL)
12470                                 dtrace_retained->dten_prev = NULL;
12471                 }
12472         } else {
12473                 ASSERT(enab != dtrace_retained);
12474                 ASSERT(dtrace_retained != NULL);
12475                 enab->dten_prev->dten_next = enab->dten_next;
12476         }
12477
12478         if (enab->dten_next != NULL) {
12479                 ASSERT(dtrace_retained != NULL);
12480                 enab->dten_next->dten_prev = enab->dten_prev;
12481         }
12482
12483         kmem_free(enab, sizeof (dtrace_enabling_t));
12484 }
12485
12486 static int
12487 dtrace_enabling_retain(dtrace_enabling_t *enab)
12488 {
12489         dtrace_state_t *state;
12490
12491         ASSERT(MUTEX_HELD(&dtrace_lock));
12492         ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12493         ASSERT(enab->dten_vstate != NULL);
12494
12495         state = enab->dten_vstate->dtvs_state;
12496         ASSERT(state != NULL);
12497
12498         /*
12499          * We only allow each state to retain dtrace_retain_max enablings.
12500          */
12501         if (state->dts_nretained >= dtrace_retain_max)
12502                 return (ENOSPC);
12503
12504         state->dts_nretained++;
12505         dtrace_retained_gen++;
12506
12507         if (dtrace_retained == NULL) {
12508                 dtrace_retained = enab;
12509                 return (0);
12510         }
12511
12512         enab->dten_next = dtrace_retained;
12513         dtrace_retained->dten_prev = enab;
12514         dtrace_retained = enab;
12515
12516         return (0);
12517 }
12518
12519 static int
12520 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12521     dtrace_probedesc_t *create)
12522 {
12523         dtrace_enabling_t *new, *enab;
12524         int found = 0, err = ENOENT;
12525
12526         ASSERT(MUTEX_HELD(&dtrace_lock));
12527         ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12528         ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12529         ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12530         ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12531
12532         new = dtrace_enabling_create(&state->dts_vstate);
12533
12534         /*
12535          * Iterate over all retained enablings, looking for enablings that
12536          * match the specified state.
12537          */
12538         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12539                 int i;
12540
12541                 /*
12542                  * dtvs_state can only be NULL for helper enablings -- and
12543                  * helper enablings can't be retained.
12544                  */
12545                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12546
12547                 if (enab->dten_vstate->dtvs_state != state)
12548                         continue;
12549
12550                 /*
12551                  * Now iterate over each probe description; we're looking for
12552                  * an exact match to the specified probe description.
12553                  */
12554                 for (i = 0; i < enab->dten_ndesc; i++) {
12555                         dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12556                         dtrace_probedesc_t *pd = &ep->dted_probe;
12557
12558                         if (strcmp(pd->dtpd_provider, match->dtpd_provider))
12559                                 continue;
12560
12561                         if (strcmp(pd->dtpd_mod, match->dtpd_mod))
12562                                 continue;
12563
12564                         if (strcmp(pd->dtpd_func, match->dtpd_func))
12565                                 continue;
12566
12567                         if (strcmp(pd->dtpd_name, match->dtpd_name))
12568                                 continue;
12569
12570                         /*
12571                          * We have a winning probe!  Add it to our growing
12572                          * enabling.
12573                          */
12574                         found = 1;
12575                         dtrace_enabling_addlike(new, ep, create);
12576                 }
12577         }
12578
12579         if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12580                 dtrace_enabling_destroy(new);
12581                 return (err);
12582         }
12583
12584         return (0);
12585 }
12586
12587 static void
12588 dtrace_enabling_retract(dtrace_state_t *state)
12589 {
12590         dtrace_enabling_t *enab, *next;
12591
12592         ASSERT(MUTEX_HELD(&dtrace_lock));
12593
12594         /*
12595          * Iterate over all retained enablings, destroy the enablings retained
12596          * for the specified state.
12597          */
12598         for (enab = dtrace_retained; enab != NULL; enab = next) {
12599                 next = enab->dten_next;
12600
12601                 /*
12602                  * dtvs_state can only be NULL for helper enablings -- and
12603                  * helper enablings can't be retained.
12604                  */
12605                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12606
12607                 if (enab->dten_vstate->dtvs_state == state) {
12608                         ASSERT(state->dts_nretained > 0);
12609                         dtrace_enabling_destroy(enab);
12610                 }
12611         }
12612
12613         ASSERT(state->dts_nretained == 0);
12614 }
12615
12616 static int
12617 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched)
12618 {
12619         int i = 0;
12620         int matched = 0;
12621
12622         ASSERT(MUTEX_HELD(&cpu_lock));
12623         ASSERT(MUTEX_HELD(&dtrace_lock));
12624
12625         for (i = 0; i < enab->dten_ndesc; i++) {
12626                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12627
12628                 enab->dten_current = ep;
12629                 enab->dten_error = 0;
12630
12631                 matched += dtrace_probe_enable(&ep->dted_probe, enab);
12632
12633                 if (enab->dten_error != 0) {
12634                         /*
12635                          * If we get an error half-way through enabling the
12636                          * probes, we kick out -- perhaps with some number of
12637                          * them enabled.  Leaving enabled probes enabled may
12638                          * be slightly confusing for user-level, but we expect
12639                          * that no one will attempt to actually drive on in
12640                          * the face of such errors.  If this is an anonymous
12641                          * enabling (indicated with a NULL nmatched pointer),
12642                          * we cmn_err() a message.  We aren't expecting to
12643                          * get such an error -- such as it can exist at all,
12644                          * it would be a result of corrupted DOF in the driver
12645                          * properties.
12646                          */
12647                         if (nmatched == NULL) {
12648                                 cmn_err(CE_WARN, "dtrace_enabling_match() "
12649                                     "error on %p: %d", (void *)ep,
12650                                     enab->dten_error);
12651                         }
12652
12653                         return (enab->dten_error);
12654                 }
12655         }
12656
12657         enab->dten_probegen = dtrace_probegen;
12658         if (nmatched != NULL)
12659                 *nmatched = matched;
12660
12661         return (0);
12662 }
12663
12664 static void
12665 dtrace_enabling_matchall(void)
12666 {
12667         dtrace_enabling_t *enab;
12668
12669         mutex_enter(&cpu_lock);
12670         mutex_enter(&dtrace_lock);
12671
12672         /*
12673          * Iterate over all retained enablings to see if any probes match
12674          * against them.  We only perform this operation on enablings for which
12675          * we have sufficient permissions by virtue of being in the global zone
12676          * or in the same zone as the DTrace client.  Because we can be called
12677          * after dtrace_detach() has been called, we cannot assert that there
12678          * are retained enablings.  We can safely load from dtrace_retained,
12679          * however:  the taskq_destroy() at the end of dtrace_detach() will
12680          * block pending our completion.
12681          */
12682         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12683 #if defined(sun)
12684                 cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
12685
12686                 if (INGLOBALZONE(curproc) ||
12687                     cr != NULL && getzoneid() == crgetzoneid(cr))
12688 #endif
12689                         (void) dtrace_enabling_match(enab, NULL);
12690         }
12691
12692         mutex_exit(&dtrace_lock);
12693         mutex_exit(&cpu_lock);
12694 }
12695
12696 /*
12697  * If an enabling is to be enabled without having matched probes (that is, if
12698  * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12699  * enabling must be _primed_ by creating an ECB for every ECB description.
12700  * This must be done to assure that we know the number of speculations, the
12701  * number of aggregations, the minimum buffer size needed, etc. before we
12702  * transition out of DTRACE_ACTIVITY_INACTIVE.  To do this without actually
12703  * enabling any probes, we create ECBs for every ECB decription, but with a
12704  * NULL probe -- which is exactly what this function does.
12705  */
12706 static void
12707 dtrace_enabling_prime(dtrace_state_t *state)
12708 {
12709         dtrace_enabling_t *enab;
12710         int i;
12711
12712         for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12713                 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12714
12715                 if (enab->dten_vstate->dtvs_state != state)
12716                         continue;
12717
12718                 /*
12719                  * We don't want to prime an enabling more than once, lest
12720                  * we allow a malicious user to induce resource exhaustion.
12721                  * (The ECBs that result from priming an enabling aren't
12722                  * leaked -- but they also aren't deallocated until the
12723                  * consumer state is destroyed.)
12724                  */
12725                 if (enab->dten_primed)
12726                         continue;
12727
12728                 for (i = 0; i < enab->dten_ndesc; i++) {
12729                         enab->dten_current = enab->dten_desc[i];
12730                         (void) dtrace_probe_enable(NULL, enab);
12731                 }
12732
12733                 enab->dten_primed = 1;
12734         }
12735 }
12736
12737 /*
12738  * Called to indicate that probes should be provided due to retained
12739  * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
12740  * must take an initial lap through the enabling calling the dtps_provide()
12741  * entry point explicitly to allow for autocreated probes.
12742  */
12743 static void
12744 dtrace_enabling_provide(dtrace_provider_t *prv)
12745 {
12746         int i, all = 0;
12747         dtrace_probedesc_t desc;
12748         dtrace_genid_t gen;
12749
12750         ASSERT(MUTEX_HELD(&dtrace_lock));
12751         ASSERT(MUTEX_HELD(&dtrace_provider_lock));
12752
12753         if (prv == NULL) {
12754                 all = 1;
12755                 prv = dtrace_provider;
12756         }
12757
12758         do {
12759                 dtrace_enabling_t *enab;
12760                 void *parg = prv->dtpv_arg;
12761
12762 retry:
12763                 gen = dtrace_retained_gen;
12764                 for (enab = dtrace_retained; enab != NULL;
12765                     enab = enab->dten_next) {
12766                         for (i = 0; i < enab->dten_ndesc; i++) {
12767                                 desc = enab->dten_desc[i]->dted_probe;
12768                                 mutex_exit(&dtrace_lock);
12769                                 prv->dtpv_pops.dtps_provide(parg, &desc);
12770                                 mutex_enter(&dtrace_lock);
12771                                 /*
12772                                  * Process the retained enablings again if
12773                                  * they have changed while we weren't holding
12774                                  * dtrace_lock.
12775                                  */
12776                                 if (gen != dtrace_retained_gen)
12777                                         goto retry;
12778                         }
12779                 }
12780         } while (all && (prv = prv->dtpv_next) != NULL);
12781
12782         mutex_exit(&dtrace_lock);
12783         dtrace_probe_provide(NULL, all ? NULL : prv);
12784         mutex_enter(&dtrace_lock);
12785 }
12786
12787 /*
12788  * Called to reap ECBs that are attached to probes from defunct providers.
12789  */
12790 static void
12791 dtrace_enabling_reap(void)
12792 {
12793         dtrace_provider_t *prov;
12794         dtrace_probe_t *probe;
12795         dtrace_ecb_t *ecb;
12796         hrtime_t when;
12797         int i;
12798
12799         mutex_enter(&cpu_lock);
12800         mutex_enter(&dtrace_lock);
12801
12802         for (i = 0; i < dtrace_nprobes; i++) {
12803                 if ((probe = dtrace_probes[i]) == NULL)
12804                         continue;
12805
12806                 if (probe->dtpr_ecb == NULL)
12807                         continue;
12808
12809                 prov = probe->dtpr_provider;
12810
12811                 if ((when = prov->dtpv_defunct) == 0)
12812                         continue;
12813
12814                 /*
12815                  * We have ECBs on a defunct provider:  we want to reap these
12816                  * ECBs to allow the provider to unregister.  The destruction
12817                  * of these ECBs must be done carefully:  if we destroy the ECB
12818                  * and the consumer later wishes to consume an EPID that
12819                  * corresponds to the destroyed ECB (and if the EPID metadata
12820                  * has not been previously consumed), the consumer will abort
12821                  * processing on the unknown EPID.  To reduce (but not, sadly,
12822                  * eliminate) the possibility of this, we will only destroy an
12823                  * ECB for a defunct provider if, for the state that
12824                  * corresponds to the ECB:
12825                  *
12826                  *  (a) There is no speculative tracing (which can effectively
12827                  *      cache an EPID for an arbitrary amount of time).
12828                  *
12829                  *  (b) The principal buffers have been switched twice since the
12830                  *      provider became defunct.
12831                  *
12832                  *  (c) The aggregation buffers are of zero size or have been
12833                  *      switched twice since the provider became defunct.
12834                  *
12835                  * We use dts_speculates to determine (a) and call a function
12836                  * (dtrace_buffer_consumed()) to determine (b) and (c).  Note
12837                  * that as soon as we've been unable to destroy one of the ECBs
12838                  * associated with the probe, we quit trying -- reaping is only
12839                  * fruitful in as much as we can destroy all ECBs associated
12840                  * with the defunct provider's probes.
12841                  */
12842                 while ((ecb = probe->dtpr_ecb) != NULL) {
12843                         dtrace_state_t *state = ecb->dte_state;
12844                         dtrace_buffer_t *buf = state->dts_buffer;
12845                         dtrace_buffer_t *aggbuf = state->dts_aggbuffer;
12846
12847                         if (state->dts_speculates)
12848                                 break;
12849
12850                         if (!dtrace_buffer_consumed(buf, when))
12851                                 break;
12852
12853                         if (!dtrace_buffer_consumed(aggbuf, when))
12854                                 break;
12855
12856                         dtrace_ecb_disable(ecb);
12857                         ASSERT(probe->dtpr_ecb != ecb);
12858                         dtrace_ecb_destroy(ecb);
12859                 }
12860         }
12861
12862         mutex_exit(&dtrace_lock);
12863         mutex_exit(&cpu_lock);
12864 }
12865
12866 /*
12867  * DTrace DOF Functions
12868  */
12869 /*ARGSUSED*/
12870 static void
12871 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12872 {
12873         if (dtrace_err_verbose)
12874                 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12875
12876 #ifdef DTRACE_ERRDEBUG
12877         dtrace_errdebug(str);
12878 #endif
12879 }
12880
12881 /*
12882  * Create DOF out of a currently enabled state.  Right now, we only create
12883  * DOF containing the run-time options -- but this could be expanded to create
12884  * complete DOF representing the enabled state.
12885  */
12886 static dof_hdr_t *
12887 dtrace_dof_create(dtrace_state_t *state)
12888 {
12889         dof_hdr_t *dof;
12890         dof_sec_t *sec;
12891         dof_optdesc_t *opt;
12892         int i, len = sizeof (dof_hdr_t) +
12893             roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12894             sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12895
12896         ASSERT(MUTEX_HELD(&dtrace_lock));
12897
12898         dof = kmem_zalloc(len, KM_SLEEP);
12899         dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12900         dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12901         dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12902         dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12903
12904         dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12905         dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12906         dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12907         dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12908         dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12909         dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12910
12911         dof->dofh_flags = 0;
12912         dof->dofh_hdrsize = sizeof (dof_hdr_t);
12913         dof->dofh_secsize = sizeof (dof_sec_t);
12914         dof->dofh_secnum = 1;   /* only DOF_SECT_OPTDESC */
12915         dof->dofh_secoff = sizeof (dof_hdr_t);
12916         dof->dofh_loadsz = len;
12917         dof->dofh_filesz = len;
12918         dof->dofh_pad = 0;
12919
12920         /*
12921          * Fill in the option section header...
12922          */
12923         sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12924         sec->dofs_type = DOF_SECT_OPTDESC;
12925         sec->dofs_align = sizeof (uint64_t);
12926         sec->dofs_flags = DOF_SECF_LOAD;
12927         sec->dofs_entsize = sizeof (dof_optdesc_t);
12928
12929         opt = (dof_optdesc_t *)((uintptr_t)sec +
12930             roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
12931
12932         sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
12933         sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12934
12935         for (i = 0; i < DTRACEOPT_MAX; i++) {
12936                 opt[i].dofo_option = i;
12937                 opt[i].dofo_strtab = DOF_SECIDX_NONE;
12938                 opt[i].dofo_value = state->dts_options[i];
12939         }
12940
12941         return (dof);
12942 }
12943
12944 static dof_hdr_t *
12945 dtrace_dof_copyin(uintptr_t uarg, int *errp)
12946 {
12947         dof_hdr_t hdr, *dof;
12948
12949         ASSERT(!MUTEX_HELD(&dtrace_lock));
12950
12951         /*
12952          * First, we're going to copyin() the sizeof (dof_hdr_t).
12953          */
12954         if (copyin((void *)uarg, &hdr, sizeof (hdr)) != 0) {
12955                 dtrace_dof_error(NULL, "failed to copyin DOF header");
12956                 *errp = EFAULT;
12957                 return (NULL);
12958         }
12959
12960         /*
12961          * Now we'll allocate the entire DOF and copy it in -- provided
12962          * that the length isn't outrageous.
12963          */
12964         if (hdr.dofh_loadsz >= dtrace_dof_maxsize) {
12965                 dtrace_dof_error(&hdr, "load size exceeds maximum");
12966                 *errp = E2BIG;
12967                 return (NULL);
12968         }
12969
12970         if (hdr.dofh_loadsz < sizeof (hdr)) {
12971                 dtrace_dof_error(&hdr, "invalid load size");
12972                 *errp = EINVAL;
12973                 return (NULL);
12974         }
12975
12976         dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
12977
12978         if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
12979             dof->dofh_loadsz != hdr.dofh_loadsz) {
12980                 kmem_free(dof, hdr.dofh_loadsz);
12981                 *errp = EFAULT;
12982                 return (NULL);
12983         }
12984
12985         return (dof);
12986 }
12987
#if !defined(sun)
/*
 * Convert a single hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f') to its
 * numeric value in the range 0-15.  Any other character yields 0.
 */
static __inline uchar_t
dtrace_dof_char(char c)
{
        if (c >= '0' && c <= '9')
                return (c - '0');

        if (c >= 'A' && c <= 'F')
                return (c - 'A' + 10);

        if (c >= 'a' && c <= 'f')
                return (c - 'a' + 10);

        /* Should not reach here. */
        return (0);
}
#endif
13022
13023 static dof_hdr_t *
13024 dtrace_dof_property(const char *name)
13025 {
13026         uchar_t *buf;
13027         uint64_t loadsz;
13028         unsigned int len, i;
13029         dof_hdr_t *dof;
13030
13031 #if defined(sun)
13032         /*
13033          * Unfortunately, array of values in .conf files are always (and
13034          * only) interpreted to be integer arrays.  We must read our DOF
13035          * as an integer array, and then squeeze it into a byte array.
13036          */
13037         if (ddi_prop_lookup_int_array(DDI_DEV_T_ANY, dtrace_devi, 0,
13038             (char *)name, (int **)&buf, &len) != DDI_PROP_SUCCESS)
13039                 return (NULL);
13040
13041         for (i = 0; i < len; i++)
13042                 buf[i] = (uchar_t)(((int *)buf)[i]);
13043
13044         if (len < sizeof (dof_hdr_t)) {
13045                 ddi_prop_free(buf);
13046                 dtrace_dof_error(NULL, "truncated header");
13047                 return (NULL);
13048         }
13049
13050         if (len < (loadsz = ((dof_hdr_t *)buf)->dofh_loadsz)) {
13051                 ddi_prop_free(buf);
13052                 dtrace_dof_error(NULL, "truncated DOF");
13053                 return (NULL);
13054         }
13055
13056         if (loadsz >= dtrace_dof_maxsize) {
13057                 ddi_prop_free(buf);
13058                 dtrace_dof_error(NULL, "oversized DOF");
13059                 return (NULL);
13060         }
13061
13062         dof = kmem_alloc(loadsz, KM_SLEEP);
13063         bcopy(buf, dof, loadsz);
13064         ddi_prop_free(buf);
13065 #else
13066         char *p;
13067         char *p_env;
13068
13069         if ((p_env = getenv(name)) == NULL)
13070                 return (NULL);
13071
13072         len = strlen(p_env) / 2;
13073
13074         buf = kmem_alloc(len, KM_SLEEP);
13075
13076         dof = (dof_hdr_t *) buf;
13077
13078         p = p_env;
13079
13080         for (i = 0; i < len; i++) {
13081                 buf[i] = (dtrace_dof_char(p[0]) << 4) |
13082                      dtrace_dof_char(p[1]);
13083                 p += 2;
13084         }
13085
13086         freeenv(p_env);
13087
13088         if (len < sizeof (dof_hdr_t)) {
13089                 kmem_free(buf, 0);
13090                 dtrace_dof_error(NULL, "truncated header");
13091                 return (NULL);
13092         }
13093
13094         if (len < (loadsz = dof->dofh_loadsz)) {
13095                 kmem_free(buf, 0);
13096                 dtrace_dof_error(NULL, "truncated DOF");
13097                 return (NULL);
13098         }
13099
13100         if (loadsz >= dtrace_dof_maxsize) {
13101                 kmem_free(buf, 0);
13102                 dtrace_dof_error(NULL, "oversized DOF");
13103                 return (NULL);
13104         }
13105 #endif
13106
13107         return (dof);
13108 }
13109
13110 static void
13111 dtrace_dof_destroy(dof_hdr_t *dof)
13112 {
13113         kmem_free(dof, dof->dofh_loadsz);
13114 }
13115
13116 /*
13117  * Return the dof_sec_t pointer corresponding to a given section index.  If the
13118  * index is not valid, dtrace_dof_error() is called and NULL is returned.  If
13119  * a type other than DOF_SECT_NONE is specified, the header is checked against
13120  * this type and NULL is returned if the types do not match.
13121  */
13122 static dof_sec_t *
13123 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13124 {
13125         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13126             ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13127
13128         if (i >= dof->dofh_secnum) {
13129                 dtrace_dof_error(dof, "referenced section index is invalid");
13130                 return (NULL);
13131         }
13132
13133         if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13134                 dtrace_dof_error(dof, "referenced section is not loadable");
13135                 return (NULL);
13136         }
13137
13138         if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13139                 dtrace_dof_error(dof, "referenced section is the wrong type");
13140                 return (NULL);
13141         }
13142
13143         return (sec);
13144 }
13145
13146 static dtrace_probedesc_t *
13147 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13148 {
13149         dof_probedesc_t *probe;
13150         dof_sec_t *strtab;
13151         uintptr_t daddr = (uintptr_t)dof;
13152         uintptr_t str;
13153         size_t size;
13154
13155         if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13156                 dtrace_dof_error(dof, "invalid probe section");
13157                 return (NULL);
13158         }
13159
13160         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13161                 dtrace_dof_error(dof, "bad alignment in probe description");
13162                 return (NULL);
13163         }
13164
13165         if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13166                 dtrace_dof_error(dof, "truncated probe description");
13167                 return (NULL);
13168         }
13169
13170         probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13171         strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13172
13173         if (strtab == NULL)
13174                 return (NULL);
13175
13176         str = daddr + strtab->dofs_offset;
13177         size = strtab->dofs_size;
13178
13179         if (probe->dofp_provider >= strtab->dofs_size) {
13180                 dtrace_dof_error(dof, "corrupt probe provider");
13181                 return (NULL);
13182         }
13183
13184         (void) strncpy(desc->dtpd_provider,
13185             (char *)(str + probe->dofp_provider),
13186             MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13187
13188         if (probe->dofp_mod >= strtab->dofs_size) {
13189                 dtrace_dof_error(dof, "corrupt probe module");
13190                 return (NULL);
13191         }
13192
13193         (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13194             MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13195
13196         if (probe->dofp_func >= strtab->dofs_size) {
13197                 dtrace_dof_error(dof, "corrupt probe function");
13198                 return (NULL);
13199         }
13200
13201         (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13202             MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13203
13204         if (probe->dofp_name >= strtab->dofs_size) {
13205                 dtrace_dof_error(dof, "corrupt probe name");
13206                 return (NULL);
13207         }
13208
13209         (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13210             MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13211
13212         return (desc);
13213 }
13214
13215 static dtrace_difo_t *
13216 dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13217     cred_t *cr)
13218 {
13219         dtrace_difo_t *dp;
13220         size_t ttl = 0;
13221         dof_difohdr_t *dofd;
13222         uintptr_t daddr = (uintptr_t)dof;
13223         size_t max = dtrace_difo_maxsize;
13224         int i, l, n;
13225
13226         static const struct {
13227                 int section;
13228                 int bufoffs;
13229                 int lenoffs;
13230                 int entsize;
13231                 int align;
13232                 const char *msg;
13233         } difo[] = {
13234                 { DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
13235                 offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
13236                 sizeof (dif_instr_t), "multiple DIF sections" },
13237
13238                 { DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
13239                 offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
13240                 sizeof (uint64_t), "multiple integer tables" },
13241
13242                 { DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
13243                 offsetof(dtrace_difo_t, dtdo_strlen), 0,
13244                 sizeof (char), "multiple string tables" },
13245
13246                 { DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
13247                 offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
13248                 sizeof (uint_t), "multiple variable tables" },
13249
13250                 { DOF_SECT_NONE, 0, 0, 0, 0, NULL }
13251         };
13252
13253         if (sec->dofs_type != DOF_SECT_DIFOHDR) {
13254                 dtrace_dof_error(dof, "invalid DIFO header section");
13255                 return (NULL);
13256         }
13257
13258         if (sec->dofs_align != sizeof (dof_secidx_t)) {
13259                 dtrace_dof_error(dof, "bad alignment in DIFO header");
13260                 return (NULL);
13261         }
13262
13263         if (sec->dofs_size < sizeof (dof_difohdr_t) ||
13264             sec->dofs_size % sizeof (dof_secidx_t)) {
13265                 dtrace_dof_error(dof, "bad size in DIFO header");
13266                 return (NULL);
13267         }
13268
13269         dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13270         n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;
13271
13272         dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
13273         dp->dtdo_rtype = dofd->dofd_rtype;
13274
13275         for (l = 0; l < n; l++) {
13276                 dof_sec_t *subsec;
13277                 void **bufp;
13278                 uint32_t *lenp;
13279
13280                 if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
13281                     dofd->dofd_links[l])) == NULL)
13282                         goto err; /* invalid section link */
13283
13284                 if (ttl + subsec->dofs_size > max) {
13285                         dtrace_dof_error(dof, "exceeds maximum size");
13286                         goto err;
13287                 }
13288
13289                 ttl += subsec->dofs_size;
13290
13291                 for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {
13292                         if (subsec->dofs_type != difo[i].section)
13293                                 continue;
13294
13295                         if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
13296                                 dtrace_dof_error(dof, "section not loaded");
13297                                 goto err;
13298                         }
13299
13300                         if (subsec->dofs_align != difo[i].align) {
13301                                 dtrace_dof_error(dof, "bad alignment");
13302                                 goto err;
13303                         }
13304
13305                         bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
13306                         lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);
13307
13308                         if (*bufp != NULL) {
13309                                 dtrace_dof_error(dof, difo[i].msg);
13310                                 goto err;
13311                         }
13312
13313                         if (difo[i].entsize != subsec->dofs_entsize) {
13314                                 dtrace_dof_error(dof, "entry size mismatch");
13315                                 goto err;
13316                         }
13317
13318                         if (subsec->dofs_entsize != 0 &&
13319                             (subsec->dofs_size % subsec->dofs_entsize) != 0) {
13320                                 dtrace_dof_error(dof, "corrupt entry size");
13321                                 goto err;
13322                         }
13323
13324                         *lenp = subsec->dofs_size;
13325                         *bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
13326                         bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
13327                             *bufp, subsec->dofs_size);
13328
13329                         if (subsec->dofs_entsize != 0)
13330                                 *lenp /= subsec->dofs_entsize;
13331
13332                         break;
13333                 }
13334
13335                 /*
13336                  * If we encounter a loadable DIFO sub-section that is not
13337                  * known to us, assume this is a broken program and fail.
13338                  */
13339                 if (difo[i].section == DOF_SECT_NONE &&
13340                     (subsec->dofs_flags & DOF_SECF_LOAD)) {
13341                         dtrace_dof_error(dof, "unrecognized DIFO subsection");
13342                         goto err;
13343                 }
13344         }
13345
13346         if (dp->dtdo_buf == NULL) {
13347                 /*
13348                  * We can't have a DIF object without DIF text.
13349                  */
13350                 dtrace_dof_error(dof, "missing DIF text");
13351                 goto err;
13352         }
13353
13354         /*
13355          * Before we validate the DIF object, run through the variable table
13356          * looking for the strings -- if any of their size are under, we'll set
13357          * their size to be the system-wide default string size.  Note that
13358          * this should _not_ happen if the "strsize" option has been set --
13359          * in this case, the compiler should have set the size to reflect the
13360          * setting of the option.
13361          */
13362         for (i = 0; i < dp->dtdo_varlen; i++) {
13363                 dtrace_difv_t *v = &dp->dtdo_vartab[i];
13364                 dtrace_diftype_t *t = &v->dtdv_type;
13365
13366                 if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
13367                         continue;
13368
13369                 if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
13370                         t->dtdt_size = dtrace_strsize_default;
13371         }
13372
13373         if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
13374                 goto err;
13375
13376         dtrace_difo_init(dp, vstate);
13377         return (dp);
13378
13379 err:
13380         kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
13381         kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
13382         kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
13383         kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
13384
13385         kmem_free(dp, sizeof (dtrace_difo_t));
13386         return (NULL);
13387 }
13388
13389 static dtrace_predicate_t *
13390 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13391     cred_t *cr)
13392 {
13393         dtrace_difo_t *dp;
13394
13395         if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13396                 return (NULL);
13397
13398         return (dtrace_predicate_create(dp));
13399 }
13400
13401 static dtrace_actdesc_t *
13402 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13403     cred_t *cr)
13404 {
13405         dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13406         dof_actdesc_t *desc;
13407         dof_sec_t *difosec;
13408         size_t offs;
13409         uintptr_t daddr = (uintptr_t)dof;
13410         uint64_t arg;
13411         dtrace_actkind_t kind;
13412
13413         if (sec->dofs_type != DOF_SECT_ACTDESC) {
13414                 dtrace_dof_error(dof, "invalid action section");
13415                 return (NULL);
13416         }
13417
13418         if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13419                 dtrace_dof_error(dof, "truncated action description");
13420                 return (NULL);
13421         }
13422
13423         if (sec->dofs_align != sizeof (uint64_t)) {
13424                 dtrace_dof_error(dof, "bad alignment in action description");
13425                 return (NULL);
13426         }
13427
13428         if (sec->dofs_size < sec->dofs_entsize) {
13429                 dtrace_dof_error(dof, "section entry size exceeds total size");
13430                 return (NULL);
13431         }
13432
13433         if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13434                 dtrace_dof_error(dof, "bad entry size in action description");
13435                 return (NULL);
13436         }
13437
13438         if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13439                 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13440                 return (NULL);
13441         }
13442
13443         for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13444                 desc = (dof_actdesc_t *)(daddr +
13445                     (uintptr_t)sec->dofs_offset + offs);
13446                 kind = (dtrace_actkind_t)desc->dofa_kind;
13447
13448                 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13449                     (kind != DTRACEACT_PRINTA ||
13450                     desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13451                     (kind == DTRACEACT_DIFEXPR &&
13452                     desc->dofa_strtab != DOF_SECIDX_NONE)) {
13453                         dof_sec_t *strtab;
13454                         char *str, *fmt;
13455                         uint64_t i;
13456
13457                         /*
13458                          * The argument to these actions is an index into the
13459                          * DOF string table.  For printf()-like actions, this
13460                          * is the format string.  For print(), this is the
13461                          * CTF type of the expression result.
13462                          */
13463                         if ((strtab = dtrace_dof_sect(dof,
13464                             DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13465                                 goto err;
13466
13467                         str = (char *)((uintptr_t)dof +
13468                             (uintptr_t)strtab->dofs_offset);
13469
13470                         for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13471                                 if (str[i] == '\0')
13472                                         break;
13473                         }
13474
13475                         if (i >= strtab->dofs_size) {
13476                                 dtrace_dof_error(dof, "bogus format string");
13477                                 goto err;
13478                         }
13479
13480                         if (i == desc->dofa_arg) {
13481                                 dtrace_dof_error(dof, "empty format string");
13482                                 goto err;
13483                         }
13484
13485                         i -= desc->dofa_arg;
13486                         fmt = kmem_alloc(i + 1, KM_SLEEP);
13487                         bcopy(&str[desc->dofa_arg], fmt, i + 1);
13488                         arg = (uint64_t)(uintptr_t)fmt;
13489                 } else {
13490                         if (kind == DTRACEACT_PRINTA) {
13491                                 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13492                                 arg = 0;
13493                         } else {
13494                                 arg = desc->dofa_arg;
13495                         }
13496                 }
13497
13498                 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13499                     desc->dofa_uarg, arg);
13500
13501                 if (last != NULL) {
13502                         last->dtad_next = act;
13503                 } else {
13504                         first = act;
13505                 }
13506
13507                 last = act;
13508
13509                 if (desc->dofa_difo == DOF_SECIDX_NONE)
13510                         continue;
13511
13512                 if ((difosec = dtrace_dof_sect(dof,
13513                     DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13514                         goto err;
13515
13516                 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13517
13518                 if (act->dtad_difo == NULL)
13519                         goto err;
13520         }
13521
13522         ASSERT(first != NULL);
13523         return (first);
13524
13525 err:
13526         for (act = first; act != NULL; act = next) {
13527                 next = act->dtad_next;
13528                 dtrace_actdesc_release(act, vstate);
13529         }
13530
13531         return (NULL);
13532 }
13533
13534 static dtrace_ecbdesc_t *
13535 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13536     cred_t *cr)
13537 {
13538         dtrace_ecbdesc_t *ep;
13539         dof_ecbdesc_t *ecb;
13540         dtrace_probedesc_t *desc;
13541         dtrace_predicate_t *pred = NULL;
13542
13543         if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13544                 dtrace_dof_error(dof, "truncated ECB description");
13545                 return (NULL);
13546         }
13547
13548         if (sec->dofs_align != sizeof (uint64_t)) {
13549                 dtrace_dof_error(dof, "bad alignment in ECB description");
13550                 return (NULL);
13551         }
13552
13553         ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13554         sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13555
13556         if (sec == NULL)
13557                 return (NULL);
13558
13559         ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13560         ep->dted_uarg = ecb->dofe_uarg;
13561         desc = &ep->dted_probe;
13562
13563         if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13564                 goto err;
13565
13566         if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13567                 if ((sec = dtrace_dof_sect(dof,
13568                     DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13569                         goto err;
13570
13571                 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13572                         goto err;
13573
13574                 ep->dted_pred.dtpdd_predicate = pred;
13575         }
13576
13577         if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13578                 if ((sec = dtrace_dof_sect(dof,
13579                     DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13580                         goto err;
13581
13582                 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13583
13584                 if (ep->dted_action == NULL)
13585                         goto err;
13586         }
13587
13588         return (ep);
13589
13590 err:
13591         if (pred != NULL)
13592                 dtrace_predicate_release(pred, vstate);
13593         kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13594         return (NULL);
13595 }
13596
13597 /*
13598  * Apply the relocations from the specified 'sec' (a DOF_SECT_URELHDR) to the
13599  * specified DOF.  At present, this amounts to simply adding 'ubase' to the
13600  * site of any user SETX relocations to account for load object base address.
13601  * In the future, if we need other relocations, this function can be extended.
13602  */
13603 static int
13604 dtrace_dof_relocate(dof_hdr_t *dof, dof_sec_t *sec, uint64_t ubase)
13605 {
13606         uintptr_t daddr = (uintptr_t)dof;
13607         dof_relohdr_t *dofr =
13608             (dof_relohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
13609         dof_sec_t *ss, *rs, *ts;
13610         dof_relodesc_t *r;
13611         uint_t i, n;
13612
13613         if (sec->dofs_size < sizeof (dof_relohdr_t) ||
13614             sec->dofs_align != sizeof (dof_secidx_t)) {
13615                 dtrace_dof_error(dof, "invalid relocation header");
13616                 return (-1);
13617         }
13618
13619         ss = dtrace_dof_sect(dof, DOF_SECT_STRTAB, dofr->dofr_strtab);
13620         rs = dtrace_dof_sect(dof, DOF_SECT_RELTAB, dofr->dofr_relsec);
13621         ts = dtrace_dof_sect(dof, DOF_SECT_NONE, dofr->dofr_tgtsec);
13622
13623         if (ss == NULL || rs == NULL || ts == NULL)
13624                 return (-1); /* dtrace_dof_error() has been called already */
13625
13626         if (rs->dofs_entsize < sizeof (dof_relodesc_t) ||
13627             rs->dofs_align != sizeof (uint64_t)) {
13628                 dtrace_dof_error(dof, "invalid relocation section");
13629                 return (-1);
13630         }
13631
13632         r = (dof_relodesc_t *)(uintptr_t)(daddr + rs->dofs_offset);
13633         n = rs->dofs_size / rs->dofs_entsize;
13634
13635         for (i = 0; i < n; i++) {
13636                 uintptr_t taddr = daddr + ts->dofs_offset + r->dofr_offset;
13637
13638                 switch (r->dofr_type) {
13639                 case DOF_RELO_NONE:
13640                         break;
13641                 case DOF_RELO_SETX:
13642                         if (r->dofr_offset >= ts->dofs_size || r->dofr_offset +
13643                             sizeof (uint64_t) > ts->dofs_size) {
13644                                 dtrace_dof_error(dof, "bad relocation offset");
13645                                 return (-1);
13646                         }
13647
13648                         if (!IS_P2ALIGNED(taddr, sizeof (uint64_t))) {
13649                                 dtrace_dof_error(dof, "misaligned setx relo");
13650                                 return (-1);
13651                         }
13652
13653                         *(uint64_t *)taddr += ubase;
13654                         break;
13655                 default:
13656                         dtrace_dof_error(dof, "invalid relocation type");
13657                         return (-1);
13658                 }
13659
13660                 r = (dof_relodesc_t *)((uintptr_t)r + rs->dofs_entsize);
13661         }
13662
13663         return (0);
13664 }
13665
13666 /*
13667  * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13668  * header:  it should be at the front of a memory region that is at least
13669  * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13670  * size.  It need not be validated in any other way.
13671  */
13672 static int
13673 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13674     dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13675 {
13676         uint64_t len = dof->dofh_loadsz, seclen;
13677         uintptr_t daddr = (uintptr_t)dof;
13678         dtrace_ecbdesc_t *ep;
13679         dtrace_enabling_t *enab;
13680         uint_t i;
13681
13682         ASSERT(MUTEX_HELD(&dtrace_lock));
13683         ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13684
13685         /*
13686          * Check the DOF header identification bytes.  In addition to checking
13687          * valid settings, we also verify that unused bits/bytes are zeroed so
13688          * we can use them later without fear of regressing existing binaries.
13689          */
13690         if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13691             DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13692                 dtrace_dof_error(dof, "DOF magic string mismatch");
13693                 return (-1);
13694         }
13695
13696         if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13697             dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13698                 dtrace_dof_error(dof, "DOF has invalid data model");
13699                 return (-1);
13700         }
13701
13702         if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13703                 dtrace_dof_error(dof, "DOF encoding mismatch");
13704                 return (-1);
13705         }
13706
13707         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
13708             dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_2) {
13709                 dtrace_dof_error(dof, "DOF version mismatch");
13710                 return (-1);
13711         }
13712
13713         if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13714                 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13715                 return (-1);
13716         }
13717
13718         if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13719                 dtrace_dof_error(dof, "DOF uses too many integer registers");
13720                 return (-1);
13721         }
13722
13723         if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13724                 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13725                 return (-1);
13726         }
13727
13728         for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13729                 if (dof->dofh_ident[i] != 0) {
13730                         dtrace_dof_error(dof, "DOF has invalid ident byte set");
13731                         return (-1);
13732                 }
13733         }
13734
13735         if (dof->dofh_flags & ~DOF_FL_VALID) {
13736                 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13737                 return (-1);
13738         }
13739
13740         if (dof->dofh_secsize == 0) {
13741                 dtrace_dof_error(dof, "zero section header size");
13742                 return (-1);
13743         }
13744
13745         /*
13746          * Check that the section headers don't exceed the amount of DOF
13747          * data.  Note that we cast the section size and number of sections
13748          * to uint64_t's to prevent possible overflow in the multiplication.
13749          */
13750         seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13751
13752         if (dof->dofh_secoff > len || seclen > len ||
13753             dof->dofh_secoff + seclen > len) {
13754                 dtrace_dof_error(dof, "truncated section headers");
13755                 return (-1);
13756         }
13757
13758         if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13759                 dtrace_dof_error(dof, "misaligned section headers");
13760                 return (-1);
13761         }
13762
13763         if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13764                 dtrace_dof_error(dof, "misaligned section size");
13765                 return (-1);
13766         }
13767
13768         /*
13769          * Take an initial pass through the section headers to be sure that
13770          * the headers don't have stray offsets.  If the 'noprobes' flag is
13771          * set, do not permit sections relating to providers, probes, or args.
13772          */
13773         for (i = 0; i < dof->dofh_secnum; i++) {
13774                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13775                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13776
13777                 if (noprobes) {
13778                         switch (sec->dofs_type) {
13779                         case DOF_SECT_PROVIDER:
13780                         case DOF_SECT_PROBES:
13781                         case DOF_SECT_PRARGS:
13782                         case DOF_SECT_PROFFS:
13783                                 dtrace_dof_error(dof, "illegal sections "
13784                                     "for enabling");
13785                                 return (-1);
13786                         }
13787                 }
13788
13789                 if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
13790                     !(sec->dofs_flags & DOF_SECF_LOAD)) {
13791                         dtrace_dof_error(dof, "loadable section with load "
13792                             "flag unset");
13793                         return (-1);
13794                 }
13795
13796                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13797                         continue; /* just ignore non-loadable sections */
13798
13799                 if (!ISP2(sec->dofs_align)) {
13800                         dtrace_dof_error(dof, "bad section alignment");
13801                         return (-1);
13802                 }
13803
13804                 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13805                         dtrace_dof_error(dof, "misaligned section");
13806                         return (-1);
13807                 }
13808
13809                 if (sec->dofs_offset > len || sec->dofs_size > len ||
13810                     sec->dofs_offset + sec->dofs_size > len) {
13811                         dtrace_dof_error(dof, "corrupt section header");
13812                         return (-1);
13813                 }
13814
13815                 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13816                     sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13817                         dtrace_dof_error(dof, "non-terminating string table");
13818                         return (-1);
13819                 }
13820         }
13821
13822         /*
13823          * Take a second pass through the sections and locate and perform any
13824          * relocations that are present.  We do this after the first pass to
13825          * be sure that all sections have had their headers validated.
13826          */
13827         for (i = 0; i < dof->dofh_secnum; i++) {
13828                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13829                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13830
13831                 if (!(sec->dofs_flags & DOF_SECF_LOAD))
13832                         continue; /* skip sections that are not loadable */
13833
13834                 switch (sec->dofs_type) {
13835                 case DOF_SECT_URELHDR:
13836                         if (dtrace_dof_relocate(dof, sec, ubase) != 0)
13837                                 return (-1);
13838                         break;
13839                 }
13840         }
13841
13842         if ((enab = *enabp) == NULL)
13843                 enab = *enabp = dtrace_enabling_create(vstate);
13844
13845         for (i = 0; i < dof->dofh_secnum; i++) {
13846                 dof_sec_t *sec = (dof_sec_t *)(daddr +
13847                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13848
13849                 if (sec->dofs_type != DOF_SECT_ECBDESC)
13850                         continue;
13851
13852                 if ((ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr)) == NULL) {
13853                         dtrace_enabling_destroy(enab);
13854                         *enabp = NULL;
13855                         return (-1);
13856                 }
13857
13858                 dtrace_enabling_add(enab, ep);
13859         }
13860
13861         return (0);
13862 }
13863
13864 /*
13865  * Process DOF for any options.  This routine assumes that the DOF has been
13866  * at least processed by dtrace_dof_slurp().
13867  */
13868 static int
13869 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13870 {
13871         int i, rval;
13872         uint32_t entsize;
13873         size_t offs;
13874         dof_optdesc_t *desc;
13875
13876         for (i = 0; i < dof->dofh_secnum; i++) {
13877                 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13878                     (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13879
13880                 if (sec->dofs_type != DOF_SECT_OPTDESC)
13881                         continue;
13882
13883                 if (sec->dofs_align != sizeof (uint64_t)) {
13884                         dtrace_dof_error(dof, "bad alignment in "
13885                             "option description");
13886                         return (EINVAL);
13887                 }
13888
13889                 if ((entsize = sec->dofs_entsize) == 0) {
13890                         dtrace_dof_error(dof, "zeroed option entry size");
13891                         return (EINVAL);
13892                 }
13893
13894                 if (entsize < sizeof (dof_optdesc_t)) {
13895                         dtrace_dof_error(dof, "bad option entry size");
13896                         return (EINVAL);
13897                 }
13898
13899                 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13900                         desc = (dof_optdesc_t *)((uintptr_t)dof +
13901                             (uintptr_t)sec->dofs_offset + offs);
13902
13903                         if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13904                                 dtrace_dof_error(dof, "non-zero option string");
13905                                 return (EINVAL);
13906                         }
13907
13908                         if (desc->dofo_value == DTRACEOPT_UNSET) {
13909                                 dtrace_dof_error(dof, "unset option");
13910                                 return (EINVAL);
13911                         }
13912
13913                         if ((rval = dtrace_state_option(state,
13914                             desc->dofo_option, desc->dofo_value)) != 0) {
13915                                 dtrace_dof_error(dof, "rejected option");
13916                                 return (rval);
13917                         }
13918                 }
13919         }
13920
13921         return (0);
13922 }
13923
13924 /*
13925  * DTrace Consumer State Functions
13926  */
13927 static int
13928 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13929 {
13930         size_t hashsize, maxper, min, chunksize = dstate->dtds_chunksize;
13931         void *base;
13932         uintptr_t limit;
13933         dtrace_dynvar_t *dvar, *next, *start;
13934         int i;
13935
13936         ASSERT(MUTEX_HELD(&dtrace_lock));
13937         ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13938
13939         bzero(dstate, sizeof (dtrace_dstate_t));
13940
13941         if ((dstate->dtds_chunksize = chunksize) == 0)
13942                 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13943
13944         if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13945                 size = min;
13946
13947         if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
13948                 return (ENOMEM);
13949
13950         dstate->dtds_size = size;
13951         dstate->dtds_base = base;
13952         dstate->dtds_percpu = kmem_cache_alloc(dtrace_state_cache, KM_SLEEP);
13953         bzero(dstate->dtds_percpu, NCPU * sizeof (dtrace_dstate_percpu_t));
13954
13955         hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13956
13957         if (hashsize != 1 && (hashsize & 1))
13958                 hashsize--;
13959
13960         dstate->dtds_hashsize = hashsize;
13961         dstate->dtds_hash = dstate->dtds_base;
13962
13963         /*
13964          * Set all of our hash buckets to point to the single sink, and (if
13965          * it hasn't already been set), set the sink's hash value to be the
13966          * sink sentinel value.  The sink is needed for dynamic variable
13967          * lookups to know that they have iterated over an entire, valid hash
13968          * chain.
13969          */
13970         for (i = 0; i < hashsize; i++)
13971                 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13972
13973         if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13974                 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13975
13976         /*
13977          * Determine number of active CPUs.  Divide free list evenly among
13978          * active CPUs.
13979          */
13980         start = (dtrace_dynvar_t *)
13981             ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13982         limit = (uintptr_t)base + size;
13983
13984         maxper = (limit - (uintptr_t)start) / NCPU;
13985         maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13986
13987 #if !defined(sun)
13988         CPU_FOREACH(i) {
13989 #else
13990         for (i = 0; i < NCPU; i++) {
13991 #endif
13992                 dstate->dtds_percpu[i].dtdsc_free = dvar = start;
13993
13994                 /*
13995                  * If we don't even have enough chunks to make it once through
13996                  * NCPUs, we're just going to allocate everything to the first
13997                  * CPU.  And if we're on the last CPU, we're going to allocate
13998                  * whatever is left over.  In either case, we set the limit to
13999                  * be the limit of the dynamic variable space.
14000                  */
14001                 if (maxper == 0 || i == NCPU - 1) {
14002                         limit = (uintptr_t)base + size;
14003                         start = NULL;
14004                 } else {
14005                         limit = (uintptr_t)start + maxper;
14006                         start = (dtrace_dynvar_t *)limit;
14007                 }
14008
14009                 ASSERT(limit <= (uintptr_t)base + size);
14010
14011                 for (;;) {
14012                         next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14013                             dstate->dtds_chunksize);
14014
14015                         if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14016                                 break;
14017
14018                         dvar->dtdv_next = next;
14019                         dvar = next;
14020                 }
14021
14022                 if (maxper == 0)
14023                         break;
14024         }
14025
14026         return (0);
14027 }
14028
14029 static void
14030 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14031 {
14032         ASSERT(MUTEX_HELD(&cpu_lock));
14033
14034         if (dstate->dtds_base == NULL)
14035                 return;
14036
14037         kmem_free(dstate->dtds_base, dstate->dtds_size);
14038         kmem_cache_free(dtrace_state_cache, dstate->dtds_percpu);
14039 }
14040
14041 static void
14042 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14043 {
14044         /*
14045          * Logical XOR, where are you?
14046          */
14047         ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14048
14049         if (vstate->dtvs_nglobals > 0) {
14050                 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14051                     sizeof (dtrace_statvar_t *));
14052         }
14053
14054         if (vstate->dtvs_ntlocals > 0) {
14055                 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14056                     sizeof (dtrace_difv_t));
14057         }
14058
14059         ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14060
14061         if (vstate->dtvs_nlocals > 0) {
14062                 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14063                     sizeof (dtrace_statvar_t *));
14064         }
14065 }
14066
14067 #if defined(sun)
14068 static void
14069 dtrace_state_clean(dtrace_state_t *state)
14070 {
14071         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14072                 return;
14073
14074         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14075         dtrace_speculation_clean(state);
14076 }
14077
14078 static void
14079 dtrace_state_deadman(dtrace_state_t *state)
14080 {
14081         hrtime_t now;
14082
14083         dtrace_sync();
14084
14085         now = dtrace_gethrtime();
14086
14087         if (state != dtrace_anon.dta_state &&
14088             now - state->dts_laststatus >= dtrace_deadman_user)
14089                 return;
14090
14091         /*
14092          * We must be sure that dts_alive never appears to be less than the
14093          * value upon entry to dtrace_state_deadman(), and because we lack a
14094          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14095          * store INT64_MAX to it, followed by a memory barrier, followed by
14096          * the new value.  This assures that dts_alive never appears to be
14097          * less than its true value, regardless of the order in which the
14098          * stores to the underlying storage are issued.
14099          */
14100         state->dts_alive = INT64_MAX;
14101         dtrace_membar_producer();
14102         state->dts_alive = now;
14103 }
14104 #else
14105 static void
14106 dtrace_state_clean(void *arg)
14107 {
14108         dtrace_state_t *state = arg;
14109         dtrace_optval_t *opt = state->dts_options;
14110
14111         if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14112                 return;
14113
14114         dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14115         dtrace_speculation_clean(state);
14116
14117         callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14118             dtrace_state_clean, state);
14119 }
14120
14121 static void
14122 dtrace_state_deadman(void *arg)
14123 {
14124         dtrace_state_t *state = arg;
14125         hrtime_t now;
14126
14127         dtrace_sync();
14128
14129         dtrace_debug_output();
14130
14131         now = dtrace_gethrtime();
14132
14133         if (state != dtrace_anon.dta_state &&
14134             now - state->dts_laststatus >= dtrace_deadman_user)
14135                 return;
14136
14137         /*
14138          * We must be sure that dts_alive never appears to be less than the
14139          * value upon entry to dtrace_state_deadman(), and because we lack a
14140          * dtrace_cas64(), we cannot store to it atomically.  We thus instead
14141          * store INT64_MAX to it, followed by a memory barrier, followed by
14142          * the new value.  This assures that dts_alive never appears to be
14143          * less than its true value, regardless of the order in which the
14144          * stores to the underlying storage are issued.
14145          */
14146         state->dts_alive = INT64_MAX;
14147         dtrace_membar_producer();
14148         state->dts_alive = now;
14149
14150         callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14151             dtrace_state_deadman, state);
14152 }
14153 #endif
14154
14155 static dtrace_state_t *
14156 #if defined(sun)
14157 dtrace_state_create(dev_t *devp, cred_t *cr)
14158 #else
14159 dtrace_state_create(struct cdev *dev)
14160 #endif
14161 {
14162 #if defined(sun)
14163         minor_t minor;
14164         major_t major;
14165 #else
14166         cred_t *cr = NULL;
14167         int m = 0;
14168 #endif
14169         char c[30];
14170         dtrace_state_t *state;
14171         dtrace_optval_t *opt;
14172         int bufsize = NCPU * sizeof (dtrace_buffer_t), i;
14173
14174         ASSERT(MUTEX_HELD(&dtrace_lock));
14175         ASSERT(MUTEX_HELD(&cpu_lock));
14176
14177 #if defined(sun)
14178         minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
14179             VM_BESTFIT | VM_SLEEP);
14180
14181         if (ddi_soft_state_zalloc(dtrace_softstate, minor) != DDI_SUCCESS) {
14182                 vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
14183                 return (NULL);
14184         }
14185
14186         state = ddi_get_soft_state(dtrace_softstate, minor);
14187 #else
14188         if (dev != NULL) {
14189                 cr = dev->si_cred;
14190                 m = dev2unit(dev);
14191         }
14192
14193         /* Allocate memory for the state. */
14194         state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
14195 #endif
14196
14197         state->dts_epid = DTRACE_EPIDNONE + 1;
14198
14199         (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
14200 #if defined(sun)
14201         state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
14202             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14203
14204         if (devp != NULL) {
14205                 major = getemajor(*devp);
14206         } else {
14207                 major = ddi_driver_major(dtrace_devi);
14208         }
14209
14210         state->dts_dev = makedevice(major, minor);
14211
14212         if (devp != NULL)
14213                 *devp = state->dts_dev;
14214 #else
14215         state->dts_aggid_arena = new_unrhdr(1, INT_MAX, &dtrace_unr_mtx);
14216         state->dts_dev = dev;
14217 #endif
14218
14219         /*
14220          * We allocate NCPU buffers.  On the one hand, this can be quite
14221          * a bit of memory per instance (nearly 36K on a Starcat).  On the
14222          * other hand, it saves an additional memory reference in the probe
14223          * path.
14224          */
14225         state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14226         state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14227
14228 #if defined(sun)
14229         state->dts_cleaner = CYCLIC_NONE;
14230         state->dts_deadman = CYCLIC_NONE;
14231 #else
14232         callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
14233         callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
14234 #endif
14235         state->dts_vstate.dtvs_state = state;
14236
14237         for (i = 0; i < DTRACEOPT_MAX; i++)
14238                 state->dts_options[i] = DTRACEOPT_UNSET;
14239
14240         /*
14241          * Set the default options.
14242          */
14243         opt = state->dts_options;
14244         opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14245         opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14246         opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14247         opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14248         opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14249         opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14250         opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14251         opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14252         opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14253         opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14254         opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14255         opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14256         opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14257         opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14258
14259         state->dts_activity = DTRACE_ACTIVITY_INACTIVE;
14260
14261         /*
14262          * Depending on the user credentials, we set flag bits which alter probe
14263          * visibility or the amount of destructiveness allowed.  In the case of
14264          * actual anonymous tracing, or the possession of all privileges, all of
14265          * the normal checks are bypassed.
14266          */
14267         if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14268                 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14269                 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14270         } else {
14271                 /*
14272                  * Set up the credentials for this instantiation.  We take a
14273                  * hold on the credential to prevent it from disappearing on
14274                  * us; this in turn prevents the zone_t referenced by this
14275                  * credential from disappearing.  This means that we can
14276                  * examine the credential and the zone from probe context.
14277                  */
14278                 crhold(cr);
14279                 state->dts_cred.dcr_cred = cr;
14280
14281                 /*
14282                  * CRA_PROC means "we have *some* privilege for dtrace" and
14283                  * unlocks the use of variables like pid, zonename, etc.
14284                  */
14285                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14286                     PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14287                         state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14288                 }
14289
14290                 /*
14291                  * dtrace_user allows use of syscall and profile providers.
14292                  * If the user also has proc_owner and/or proc_zone, we
14293                  * extend the scope to include additional visibility and
14294                  * destructive power.
14295                  */
14296                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14297                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14298                                 state->dts_cred.dcr_visible |=
14299                                     DTRACE_CRV_ALLPROC;
14300
14301                                 state->dts_cred.dcr_action |=
14302                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14303                         }
14304
14305                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14306                                 state->dts_cred.dcr_visible |=
14307                                     DTRACE_CRV_ALLZONE;
14308
14309                                 state->dts_cred.dcr_action |=
14310                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14311                         }
14312
14313                         /*
14314                          * If we have all privs in whatever zone this is,
14315                          * we can do destructive things to processes which
14316                          * have altered credentials.
14317                          */
14318 #if defined(sun)
14319                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14320                             cr->cr_zone->zone_privset)) {
14321                                 state->dts_cred.dcr_action |=
14322                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14323                         }
14324 #endif
14325                 }
14326
14327                 /*
14328                  * Holding the dtrace_kernel privilege also implies that
14329                  * the user has the dtrace_user privilege from a visibility
14330                  * perspective.  But without further privileges, some
14331                  * destructive actions are not available.
14332                  */
14333                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14334                         /*
14335                          * Make all probes in all zones visible.  However,
14336                          * this doesn't mean that all actions become available
14337                          * to all zones.
14338                          */
14339                         state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14340                             DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14341
14342                         state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14343                             DTRACE_CRA_PROC;
14344                         /*
14345                          * Holding proc_owner means that destructive actions
14346                          * for *this* zone are allowed.
14347                          */
14348                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14349                                 state->dts_cred.dcr_action |=
14350                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14351
14352                         /*
14353                          * Holding proc_zone means that destructive actions
14354                          * for this user/group ID in all zones is allowed.
14355                          */
14356                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14357                                 state->dts_cred.dcr_action |=
14358                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14359
14360 #if defined(sun)
14361                         /*
14362                          * If we have all privs in whatever zone this is,
14363                          * we can do destructive things to processes which
14364                          * have altered credentials.
14365                          */
14366                         if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
14367                             cr->cr_zone->zone_privset)) {
14368                                 state->dts_cred.dcr_action |=
14369                                     DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14370                         }
14371 #endif
14372                 }
14373
14374                 /*
14375                  * Holding the dtrace_proc privilege gives control over fasttrap
14376                  * and pid providers.  We need to grant wider destructive
14377                  * privileges in the event that the user has proc_owner and/or
14378                  * proc_zone.
14379                  */
14380                 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14381                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14382                                 state->dts_cred.dcr_action |=
14383                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14384
14385                         if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14386                                 state->dts_cred.dcr_action |=
14387                                     DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14388                 }
14389         }
14390
14391         return (state);
14392 }
14393
14394 static int
14395 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14396 {
14397         dtrace_optval_t *opt = state->dts_options, size;
14398         processorid_t cpu = 0;;
14399         int flags = 0, rval, factor, divisor = 1;
14400
14401         ASSERT(MUTEX_HELD(&dtrace_lock));
14402         ASSERT(MUTEX_HELD(&cpu_lock));
14403         ASSERT(which < DTRACEOPT_MAX);
14404         ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14405             (state == dtrace_anon.dta_state &&
14406             state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14407
14408         if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14409                 return (0);
14410
14411         if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14412                 cpu = opt[DTRACEOPT_CPU];
14413
14414         if (which == DTRACEOPT_SPECSIZE)
14415                 flags |= DTRACEBUF_NOSWITCH;
14416
14417         if (which == DTRACEOPT_BUFSIZE) {
14418                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14419                         flags |= DTRACEBUF_RING;
14420
14421                 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14422                         flags |= DTRACEBUF_FILL;
14423
14424                 if (state != dtrace_anon.dta_state ||
14425                     state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14426                         flags |= DTRACEBUF_INACTIVE;
14427         }
14428
14429         for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
14430                 /*
14431                  * The size must be 8-byte aligned.  If the size is not 8-byte
14432                  * aligned, drop it down by the difference.
14433                  */
14434                 if (size & (sizeof (uint64_t) - 1))
14435                         size -= size & (sizeof (uint64_t) - 1);
14436
14437                 if (size < state->dts_reserve) {
14438                         /*
14439                          * Buffers always must be large enough to accommodate
14440                          * their prereserved space.  We return E2BIG instead
14441                          * of ENOMEM in this case to allow for user-level
14442                          * software to differentiate the cases.
14443                          */
14444                         return (E2BIG);
14445                 }
14446
14447                 rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
14448
14449                 if (rval != ENOMEM) {
14450                         opt[which] = size;
14451                         return (rval);
14452                 }
14453
14454                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14455                         return (rval);
14456
14457                 for (divisor = 2; divisor < factor; divisor <<= 1)
14458                         continue;
14459         }
14460
14461         return (ENOMEM);
14462 }
14463
14464 static int
14465 dtrace_state_buffers(dtrace_state_t *state)
14466 {
14467         dtrace_speculation_t *spec = state->dts_speculations;
14468         int rval, i;
14469
14470         if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14471             DTRACEOPT_BUFSIZE)) != 0)
14472                 return (rval);
14473
14474         if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14475             DTRACEOPT_AGGSIZE)) != 0)
14476                 return (rval);
14477
14478         for (i = 0; i < state->dts_nspeculations; i++) {
14479                 if ((rval = dtrace_state_buffer(state,
14480                     spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14481                         return (rval);
14482         }
14483
14484         return (0);
14485 }
14486
14487 static void
14488 dtrace_state_prereserve(dtrace_state_t *state)
14489 {
14490         dtrace_ecb_t *ecb;
14491         dtrace_probe_t *probe;
14492
14493         state->dts_reserve = 0;
14494
14495         if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14496                 return;
14497
14498         /*
14499          * If our buffer policy is a "fill" buffer policy, we need to set the
14500          * prereserved space to be the space required by the END probes.
14501          */
14502         probe = dtrace_probes[dtrace_probeid_end - 1];
14503         ASSERT(probe != NULL);
14504
14505         for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14506                 if (ecb->dte_state != state)
14507                         continue;
14508
14509                 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14510         }
14511 }
14512
14513 static int
14514 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14515 {
14516         dtrace_optval_t *opt = state->dts_options, sz, nspec;
14517         dtrace_speculation_t *spec;
14518         dtrace_buffer_t *buf;
14519 #if defined(sun)
14520         cyc_handler_t hdlr;
14521         cyc_time_t when;
14522 #endif
14523         int rval = 0, i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14524         dtrace_icookie_t cookie;
14525
14526         mutex_enter(&cpu_lock);
14527         mutex_enter(&dtrace_lock);
14528
14529         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14530                 rval = EBUSY;
14531                 goto out;
14532         }
14533
14534         /*
14535          * Before we can perform any checks, we must prime all of the
14536          * retained enablings that correspond to this state.
14537          */
14538         dtrace_enabling_prime(state);
14539
14540         if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14541                 rval = EACCES;
14542                 goto out;
14543         }
14544
14545         dtrace_state_prereserve(state);
14546
14547         /*
14548          * Now we want to do is try to allocate our speculations.
14549          * We do not automatically resize the number of speculations; if
14550          * this fails, we will fail the operation.
14551          */
14552         nspec = opt[DTRACEOPT_NSPEC];
14553         ASSERT(nspec != DTRACEOPT_UNSET);
14554
14555         if (nspec > INT_MAX) {
14556                 rval = ENOMEM;
14557                 goto out;
14558         }
14559
14560         spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
14561             KM_NOSLEEP | KM_NORMALPRI);
14562
14563         if (spec == NULL) {
14564                 rval = ENOMEM;
14565                 goto out;
14566         }
14567
14568         state->dts_speculations = spec;
14569         state->dts_nspeculations = (int)nspec;
14570
14571         for (i = 0; i < nspec; i++) {
14572                 if ((buf = kmem_zalloc(bufsize,
14573                     KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
14574                         rval = ENOMEM;
14575                         goto err;
14576                 }
14577
14578                 spec[i].dtsp_buffer = buf;
14579         }
14580
14581         if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14582                 if (dtrace_anon.dta_state == NULL) {
14583                         rval = ENOENT;
14584                         goto out;
14585                 }
14586
14587                 if (state->dts_necbs != 0) {
14588                         rval = EALREADY;
14589                         goto out;
14590                 }
14591
14592                 state->dts_anon = dtrace_anon_grab();
14593                 ASSERT(state->dts_anon != NULL);
14594                 state = state->dts_anon;
14595
14596                 /*
14597                  * We want "grabanon" to be set in the grabbed state, so we'll
14598                  * copy that option value from the grabbing state into the
14599                  * grabbed state.
14600                  */
14601                 state->dts_options[DTRACEOPT_GRABANON] =
14602                     opt[DTRACEOPT_GRABANON];
14603
14604                 *cpu = dtrace_anon.dta_beganon;
14605
14606                 /*
14607                  * If the anonymous state is active (as it almost certainly
14608                  * is if the anonymous enabling ultimately matched anything),
14609                  * we don't allow any further option processing -- but we
14610                  * don't return failure.
14611                  */
14612                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14613                         goto out;
14614         }
14615
14616         if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14617             opt[DTRACEOPT_AGGSIZE] != 0) {
14618                 if (state->dts_aggregations == NULL) {
14619                         /*
14620                          * We're not going to create an aggregation buffer
14621                          * because we don't have any ECBs that contain
14622                          * aggregations -- set this option to 0.
14623                          */
14624                         opt[DTRACEOPT_AGGSIZE] = 0;
14625                 } else {
14626                         /*
14627                          * If we have an aggregation buffer, we must also have
14628                          * a buffer to use as scratch.
14629                          */
14630                         if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14631                             opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14632                                 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14633                         }
14634                 }
14635         }
14636
14637         if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14638             opt[DTRACEOPT_SPECSIZE] != 0) {
14639                 if (!state->dts_speculates) {
14640                         /*
14641                          * We're not going to create speculation buffers
14642                          * because we don't have any ECBs that actually
14643                          * speculate -- set the speculation size to 0.
14644                          */
14645                         opt[DTRACEOPT_SPECSIZE] = 0;
14646                 }
14647         }
14648
14649         /*
14650          * The bare minimum size for any buffer that we're actually going to
14651          * do anything to is sizeof (uint64_t).
14652          */
14653         sz = sizeof (uint64_t);
14654
14655         if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14656             (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14657             (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14658                 /*
14659                  * A buffer size has been explicitly set to 0 (or to a size
14660                  * that will be adjusted to 0) and we need the space -- we
14661                  * need to return failure.  We return ENOSPC to differentiate
14662                  * it from failing to allocate a buffer due to failure to meet
14663                  * the reserve (for which we return E2BIG).
14664                  */
14665                 rval = ENOSPC;
14666                 goto out;
14667         }
14668
14669         if ((rval = dtrace_state_buffers(state)) != 0)
14670                 goto err;
14671
14672         if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14673                 sz = dtrace_dstate_defsize;
14674
14675         do {
14676                 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14677
14678                 if (rval == 0)
14679                         break;
14680
14681                 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14682                         goto err;
14683         } while (sz >>= 1);
14684
14685         opt[DTRACEOPT_DYNVARSIZE] = sz;
14686
14687         if (rval != 0)
14688                 goto err;
14689
14690         if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14691                 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14692
14693         if (opt[DTRACEOPT_CLEANRATE] == 0)
14694                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14695
14696         if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14697                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14698
14699         if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14700                 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14701
14702         state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14703 #if defined(sun)
14704         hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14705         hdlr.cyh_arg = state;
14706         hdlr.cyh_level = CY_LOW_LEVEL;
14707
14708         when.cyt_when = 0;
14709         when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14710
14711         state->dts_cleaner = cyclic_add(&hdlr, &when);
14712
14713         hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14714         hdlr.cyh_arg = state;
14715         hdlr.cyh_level = CY_LOW_LEVEL;
14716
14717         when.cyt_when = 0;
14718         when.cyt_interval = dtrace_deadman_interval;
14719
14720         state->dts_deadman = cyclic_add(&hdlr, &when);
14721 #else
14722         callout_reset(&state->dts_cleaner, hz * opt[DTRACEOPT_CLEANRATE] / NANOSEC,
14723             dtrace_state_clean, state);
14724         callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
14725             dtrace_state_deadman, state);
14726 #endif
14727
14728         state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14729
14730 #if defined(sun)
14731         if (state->dts_getf != 0 &&
14732             !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14733                 /*
14734                  * We don't have kernel privs but we have at least one call
14735                  * to getf(); we need to bump our zone's count, and (if
14736                  * this is the first enabling to have an unprivileged call
14737                  * to getf()) we need to hook into closef().
14738                  */
14739                 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
14740
14741                 if (dtrace_getf++ == 0) {
14742                         ASSERT(dtrace_closef == NULL);
14743                         dtrace_closef = dtrace_getf_barrier;
14744                 }
14745         }
14746 #endif
14747
14748         /*
14749          * Now it's time to actually fire the BEGIN probe.  We need to disable
14750          * interrupts here both to record the CPU on which we fired the BEGIN
14751          * probe (the data from this CPU will be processed first at user
14752          * level) and to manually activate the buffer for this CPU.
14753          */
14754         cookie = dtrace_interrupt_disable();
14755         *cpu = curcpu;
14756         ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14757         state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14758
14759         dtrace_probe(dtrace_probeid_begin,
14760             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14761         dtrace_interrupt_enable(cookie);
14762         /*
14763          * We may have had an exit action from a BEGIN probe; only change our
14764          * state to ACTIVE if we're still in WARMUP.
14765          */
14766         ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14767             state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14768
14769         if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14770                 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14771
14772         /*
14773          * Regardless of whether or not now we're in ACTIVE or DRAINING, we
14774          * want each CPU to transition its principal buffer out of the
14775          * INACTIVE state.  Doing this assures that no CPU will suddenly begin
14776          * processing an ECB halfway down a probe's ECB chain; all CPUs will
14777          * atomically transition from processing none of a state's ECBs to
14778          * processing all of them.
14779          */
14780         dtrace_xcall(DTRACE_CPUALL,
14781             (dtrace_xcall_t)dtrace_buffer_activate, state);
14782         goto out;
14783
14784 err:
14785         dtrace_buffer_free(state->dts_buffer);
14786         dtrace_buffer_free(state->dts_aggbuffer);
14787
14788         if ((nspec = state->dts_nspeculations) == 0) {
14789                 ASSERT(state->dts_speculations == NULL);
14790                 goto out;
14791         }
14792
14793         spec = state->dts_speculations;
14794         ASSERT(spec != NULL);
14795
14796         for (i = 0; i < state->dts_nspeculations; i++) {
14797                 if ((buf = spec[i].dtsp_buffer) == NULL)
14798                         break;
14799
14800                 dtrace_buffer_free(buf);
14801                 kmem_free(buf, bufsize);
14802         }
14803
14804         kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14805         state->dts_nspeculations = 0;
14806         state->dts_speculations = NULL;
14807
14808 out:
14809         mutex_exit(&dtrace_lock);
14810         mutex_exit(&cpu_lock);
14811
14812         return (rval);
14813 }
14814
14815 static int
14816 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14817 {
14818         dtrace_icookie_t cookie;
14819
14820         ASSERT(MUTEX_HELD(&dtrace_lock));
14821
14822         if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14823             state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14824                 return (EINVAL);
14825
14826         /*
14827          * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14828          * to be sure that every CPU has seen it.  See below for the details
14829          * on why this is done.
14830          */
14831         state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14832         dtrace_sync();
14833
14834         /*
14835          * By this point, it is impossible for any CPU to be still processing
14836          * with DTRACE_ACTIVITY_ACTIVE.  We can thus set our activity to
14837          * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14838          * other CPU in dtrace_buffer_reserve().  This allows dtrace_probe()
14839          * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14840          * iff we're in the END probe.
14841          */
14842         state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14843         dtrace_sync();
14844         ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14845
14846         /*
14847          * Finally, we can release the reserve and call the END probe.  We
14848          * disable interrupts across calling the END probe to allow us to
14849          * return the CPU on which we actually called the END probe.  This
14850          * allows user-land to be sure that this CPU's principal buffer is
14851          * processed last.
14852          */
14853         state->dts_reserve = 0;
14854
14855         cookie = dtrace_interrupt_disable();
14856         *cpu = curcpu;
14857         dtrace_probe(dtrace_probeid_end,
14858             (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14859         dtrace_interrupt_enable(cookie);
14860
14861         state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14862         dtrace_sync();
14863
14864 #if defined(sun)
14865         if (state->dts_getf != 0 &&
14866             !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
14867                 /*
14868                  * We don't have kernel privs but we have at least one call
14869                  * to getf(); we need to lower our zone's count, and (if
14870                  * this is the last enabling to have an unprivileged call
14871                  * to getf()) we need to clear the closef() hook.
14872                  */
14873                 ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
14874                 ASSERT(dtrace_closef == dtrace_getf_barrier);
14875                 ASSERT(dtrace_getf > 0);
14876
14877                 state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
14878
14879                 if (--dtrace_getf == 0)
14880                         dtrace_closef = NULL;
14881         }
14882 #endif
14883
14884         return (0);
14885 }
14886
14887 static int
14888 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14889     dtrace_optval_t val)
14890 {
14891         ASSERT(MUTEX_HELD(&dtrace_lock));
14892
14893         if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14894                 return (EBUSY);
14895
14896         if (option >= DTRACEOPT_MAX)
14897                 return (EINVAL);
14898
14899         if (option != DTRACEOPT_CPU && val < 0)
14900                 return (EINVAL);
14901
14902         switch (option) {
14903         case DTRACEOPT_DESTRUCTIVE:
14904                 if (dtrace_destructive_disallow)
14905                         return (EACCES);
14906
14907                 state->dts_cred.dcr_destructive = 1;
14908                 break;
14909
14910         case DTRACEOPT_BUFSIZE:
14911         case DTRACEOPT_DYNVARSIZE:
14912         case DTRACEOPT_AGGSIZE:
14913         case DTRACEOPT_SPECSIZE:
14914         case DTRACEOPT_STRSIZE:
14915                 if (val < 0)
14916                         return (EINVAL);
14917
14918                 if (val >= LONG_MAX) {
14919                         /*
14920                          * If this is an otherwise negative value, set it to
14921                          * the highest multiple of 128m less than LONG_MAX.
14922                          * Technically, we're adjusting the size without
14923                          * regard to the buffer resizing policy, but in fact,
14924                          * this has no effect -- if we set the buffer size to
14925                          * ~LONG_MAX and the buffer policy is ultimately set to
14926                          * be "manual", the buffer allocation is guaranteed to
14927                          * fail, if only because the allocation requires two
14928                          * buffers.  (We set the the size to the highest
14929                          * multiple of 128m because it ensures that the size
14930                          * will remain a multiple of a megabyte when
14931                          * repeatedly halved -- all the way down to 15m.)
14932                          */
14933                         val = LONG_MAX - (1 << 27) + 1;
14934                 }
14935         }
14936
14937         state->dts_options[option] = val;
14938
14939         return (0);
14940 }
14941
14942 static void
14943 dtrace_state_destroy(dtrace_state_t *state)
14944 {
14945         dtrace_ecb_t *ecb;
14946         dtrace_vstate_t *vstate = &state->dts_vstate;
14947 #if defined(sun)
14948         minor_t minor = getminor(state->dts_dev);
14949 #endif
14950         int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
14951         dtrace_speculation_t *spec = state->dts_speculations;
14952         int nspec = state->dts_nspeculations;
14953         uint32_t match;
14954
14955         ASSERT(MUTEX_HELD(&dtrace_lock));
14956         ASSERT(MUTEX_HELD(&cpu_lock));
14957
14958         /*
14959          * First, retract any retained enablings for this state.
14960          */
14961         dtrace_enabling_retract(state);
14962         ASSERT(state->dts_nretained == 0);
14963
14964         if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14965             state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14966                 /*
14967                  * We have managed to come into dtrace_state_destroy() on a
14968                  * hot enabling -- almost certainly because of a disorderly
14969                  * shutdown of a consumer.  (That is, a consumer that is
14970                  * exiting without having called dtrace_stop().) In this case,
14971                  * we're going to set our activity to be KILLED, and then
14972                  * issue a sync to be sure that everyone is out of probe
14973                  * context before we start blowing away ECBs.
14974                  */
14975                 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14976                 dtrace_sync();
14977         }
14978
14979         /*
14980          * Release the credential hold we took in dtrace_state_create().
14981          */
14982         if (state->dts_cred.dcr_cred != NULL)
14983                 crfree(state->dts_cred.dcr_cred);
14984
14985         /*
14986          * Now we can safely disable and destroy any enabled probes.  Because
14987          * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14988          * (especially if they're all enabled), we take two passes through the
14989          * ECBs:  in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14990          * in the second we disable whatever is left over.
14991          */
14992         for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14993                 for (i = 0; i < state->dts_necbs; i++) {
14994                         if ((ecb = state->dts_ecbs[i]) == NULL)
14995                                 continue;
14996
14997                         if (match && ecb->dte_probe != NULL) {
14998                                 dtrace_probe_t *probe = ecb->dte_probe;
14999                                 dtrace_provider_t *prov = probe->dtpr_provider;
15000
15001                                 if (!(prov->dtpv_priv.dtpp_flags & match))
15002                                         continue;
15003                         }
15004
15005                         dtrace_ecb_disable(ecb);
15006                         dtrace_ecb_destroy(ecb);
15007                 }
15008
15009                 if (!match)
15010                         break;
15011         }
15012
15013         /*
15014          * Before we free the buffers, perform one more sync to assure that
15015          * every CPU is out of probe context.
15016          */
15017         dtrace_sync();
15018
15019         dtrace_buffer_free(state->dts_buffer);
15020         dtrace_buffer_free(state->dts_aggbuffer);
15021
15022         for (i = 0; i < nspec; i++)
15023                 dtrace_buffer_free(spec[i].dtsp_buffer);
15024
15025 #if defined(sun)
15026         if (state->dts_cleaner != CYCLIC_NONE)
15027                 cyclic_remove(state->dts_cleaner);
15028
15029         if (state->dts_deadman != CYCLIC_NONE)
15030                 cyclic_remove(state->dts_deadman);
15031 #else
15032         callout_stop(&state->dts_cleaner);
15033         callout_drain(&state->dts_cleaner);
15034         callout_stop(&state->dts_deadman);
15035         callout_drain(&state->dts_deadman);
15036 #endif
15037
15038         dtrace_dstate_fini(&vstate->dtvs_dynvars);
15039         dtrace_vstate_fini(vstate);
15040         if (state->dts_ecbs != NULL)
15041                 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
15042
15043         if (state->dts_aggregations != NULL) {
15044 #ifdef DEBUG
15045                 for (i = 0; i < state->dts_naggregations; i++)
15046                         ASSERT(state->dts_aggregations[i] == NULL);
15047 #endif
15048                 ASSERT(state->dts_naggregations > 0);
15049                 kmem_free(state->dts_aggregations,
15050                     state->dts_naggregations * sizeof (dtrace_aggregation_t *));
15051         }
15052
15053         kmem_free(state->dts_buffer, bufsize);
15054         kmem_free(state->dts_aggbuffer, bufsize);
15055
15056         for (i = 0; i < nspec; i++)
15057                 kmem_free(spec[i].dtsp_buffer, bufsize);
15058
15059         if (spec != NULL)
15060                 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15061
15062         dtrace_format_destroy(state);
15063
15064         if (state->dts_aggid_arena != NULL) {
15065 #if defined(sun)
15066                 vmem_destroy(state->dts_aggid_arena);
15067 #else
15068                 delete_unrhdr(state->dts_aggid_arena);
15069 #endif
15070                 state->dts_aggid_arena = NULL;
15071         }
15072 #if defined(sun)
15073         ddi_soft_state_free(dtrace_softstate, minor);
15074         vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
15075 #endif
15076 }
15077
15078 /*
15079  * DTrace Anonymous Enabling Functions
15080  */
15081 static dtrace_state_t *
15082 dtrace_anon_grab(void)
15083 {
15084         dtrace_state_t *state;
15085
15086         ASSERT(MUTEX_HELD(&dtrace_lock));
15087
15088         if ((state = dtrace_anon.dta_state) == NULL) {
15089                 ASSERT(dtrace_anon.dta_enabling == NULL);
15090                 return (NULL);
15091         }
15092
15093         ASSERT(dtrace_anon.dta_enabling != NULL);
15094         ASSERT(dtrace_retained != NULL);
15095
15096         dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15097         dtrace_anon.dta_enabling = NULL;
15098         dtrace_anon.dta_state = NULL;
15099
15100         return (state);
15101 }
15102
15103 static void
15104 dtrace_anon_property(void)
15105 {
15106         int i, rv;
15107         dtrace_state_t *state;
15108         dof_hdr_t *dof;
15109         char c[32];             /* enough for "dof-data-" + digits */
15110
15111         ASSERT(MUTEX_HELD(&dtrace_lock));
15112         ASSERT(MUTEX_HELD(&cpu_lock));
15113
15114         for (i = 0; ; i++) {
15115                 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15116
15117                 dtrace_err_verbose = 1;
15118
15119                 if ((dof = dtrace_dof_property(c)) == NULL) {
15120                         dtrace_err_verbose = 0;
15121                         break;
15122                 }
15123
15124 #if defined(sun)
15125                 /*
15126                  * We want to create anonymous state, so we need to transition
15127                  * the kernel debugger to indicate that DTrace is active.  If
15128                  * this fails (e.g. because the debugger has modified text in
15129                  * some way), we won't continue with the processing.
15130                  */
15131                 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15132                         cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15133                             "enabling ignored.");
15134                         dtrace_dof_destroy(dof);
15135                         break;
15136                 }
15137 #endif
15138
15139                 /*
15140                  * If we haven't allocated an anonymous state, we'll do so now.
15141                  */
15142                 if ((state = dtrace_anon.dta_state) == NULL) {
15143 #if defined(sun)
15144                         state = dtrace_state_create(NULL, NULL);
15145 #else
15146                         state = dtrace_state_create(NULL);
15147 #endif
15148                         dtrace_anon.dta_state = state;
15149
15150                         if (state == NULL) {
15151                                 /*
15152                                  * This basically shouldn't happen:  the only
15153                                  * failure mode from dtrace_state_create() is a
15154                                  * failure of ddi_soft_state_zalloc() that
15155                                  * itself should never happen.  Still, the
15156                                  * interface allows for a failure mode, and
15157                                  * we want to fail as gracefully as possible:
15158                                  * we'll emit an error message and cease
15159                                  * processing anonymous state in this case.
15160                                  */
15161                                 cmn_err(CE_WARN, "failed to create "
15162                                     "anonymous state");
15163                                 dtrace_dof_destroy(dof);
15164                                 break;
15165                         }
15166                 }
15167
15168                 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15169                     &dtrace_anon.dta_enabling, 0, B_TRUE);
15170
15171                 if (rv == 0)
15172                         rv = dtrace_dof_options(dof, state);
15173
15174                 dtrace_err_verbose = 0;
15175                 dtrace_dof_destroy(dof);
15176
15177                 if (rv != 0) {
15178                         /*
15179                          * This is malformed DOF; chuck any anonymous state
15180                          * that we created.
15181                          */
15182                         ASSERT(dtrace_anon.dta_enabling == NULL);
15183                         dtrace_state_destroy(state);
15184                         dtrace_anon.dta_state = NULL;
15185                         break;
15186                 }
15187
15188                 ASSERT(dtrace_anon.dta_enabling != NULL);
15189         }
15190
15191         if (dtrace_anon.dta_enabling != NULL) {
15192                 int rval;
15193
15194                 /*
15195                  * dtrace_enabling_retain() can only fail because we are
15196                  * trying to retain more enablings than are allowed -- but
15197                  * we only have one anonymous enabling, and we are guaranteed
15198                  * to be allowed at least one retained enabling; we assert
15199                  * that dtrace_enabling_retain() returns success.
15200                  */
15201                 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15202                 ASSERT(rval == 0);
15203
15204                 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15205         }
15206 }
15207
15208 /*
15209  * DTrace Helper Functions
15210  */
15211 static void
15212 dtrace_helper_trace(dtrace_helper_action_t *helper,
15213     dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15214 {
15215         uint32_t size, next, nnext, i;
15216         dtrace_helptrace_t *ent, *buffer;
15217         uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
15218
15219         if ((buffer = dtrace_helptrace_buffer) == NULL)
15220                 return;
15221
15222         ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15223
15224         /*
15225          * What would a tracing framework be without its own tracing
15226          * framework?  (Well, a hell of a lot simpler, for starters...)
15227          */
15228         size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15229             sizeof (uint64_t) - sizeof (uint64_t);
15230
15231         /*
15232          * Iterate until we can allocate a slot in the trace buffer.
15233          */
15234         do {
15235                 next = dtrace_helptrace_next;
15236
15237                 if (next + size < dtrace_helptrace_bufsize) {
15238                         nnext = next + size;
15239                 } else {
15240                         nnext = size;
15241                 }
15242         } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15243
15244         /*
15245          * We have our slot; fill it in.
15246          */
15247         if (nnext == size) {
15248                 dtrace_helptrace_wrapped++;
15249                 next = 0;
15250         }
15251
15252         ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
15253         ent->dtht_helper = helper;
15254         ent->dtht_where = where;
15255         ent->dtht_nlocals = vstate->dtvs_nlocals;
15256
15257         ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15258             mstate->dtms_fltoffs : -1;
15259         ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15260         ent->dtht_illval = cpu_core[curcpu].cpuc_dtrace_illval;
15261
15262         for (i = 0; i < vstate->dtvs_nlocals; i++) {
15263                 dtrace_statvar_t *svar;
15264
15265                 if ((svar = vstate->dtvs_locals[i]) == NULL)
15266                         continue;
15267
15268                 ASSERT(svar->dtsv_size >= NCPU * sizeof (uint64_t));
15269                 ent->dtht_locals[i] =
15270                     ((uint64_t *)(uintptr_t)svar->dtsv_data)[curcpu];
15271         }
15272 }
15273
15274 static uint64_t
15275 dtrace_helper(int which, dtrace_mstate_t *mstate,
15276     dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15277 {
15278         uint16_t *flags = &cpu_core[curcpu].cpuc_dtrace_flags;
15279         uint64_t sarg0 = mstate->dtms_arg[0];
15280         uint64_t sarg1 = mstate->dtms_arg[1];
15281         uint64_t rval = 0;
15282         dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15283         dtrace_helper_action_t *helper;
15284         dtrace_vstate_t *vstate;
15285         dtrace_difo_t *pred;
15286         int i, trace = dtrace_helptrace_buffer != NULL;
15287
15288         ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15289
15290         if (helpers == NULL)
15291                 return (0);
15292
15293         if ((helper = helpers->dthps_actions[which]) == NULL)
15294                 return (0);
15295
15296         vstate = &helpers->dthps_vstate;
15297         mstate->dtms_arg[0] = arg0;
15298         mstate->dtms_arg[1] = arg1;
15299
15300         /*
15301          * Now iterate over each helper.  If its predicate evaluates to 'true',
15302          * we'll call the corresponding actions.  Note that the below calls
15303          * to dtrace_dif_emulate() may set faults in machine state.  This is
15304          * okay:  our caller (the outer dtrace_dif_emulate()) will simply plow
15305          * the stored DIF offset with its own (which is the desired behavior).
15306          * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15307          * from machine state; this is okay, too.
15308          */
15309         for (; helper != NULL; helper = helper->dtha_next) {
15310                 if ((pred = helper->dtha_predicate) != NULL) {
15311                         if (trace)
15312                                 dtrace_helper_trace(helper, mstate, vstate, 0);
15313
15314                         if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15315                                 goto next;
15316
15317                         if (*flags & CPU_DTRACE_FAULT)
15318                                 goto err;
15319                 }
15320
15321                 for (i = 0; i < helper->dtha_nactions; i++) {
15322                         if (trace)
15323                                 dtrace_helper_trace(helper,
15324                                     mstate, vstate, i + 1);
15325
15326                         rval = dtrace_dif_emulate(helper->dtha_actions[i],
15327                             mstate, vstate, state);
15328
15329                         if (*flags & CPU_DTRACE_FAULT)
15330                                 goto err;
15331                 }
15332
15333 next:
15334                 if (trace)
15335                         dtrace_helper_trace(helper, mstate, vstate,
15336                             DTRACE_HELPTRACE_NEXT);
15337         }
15338
15339         if (trace)
15340                 dtrace_helper_trace(helper, mstate, vstate,
15341                     DTRACE_HELPTRACE_DONE);
15342
15343         /*
15344          * Restore the arg0 that we saved upon entry.
15345          */
15346         mstate->dtms_arg[0] = sarg0;
15347         mstate->dtms_arg[1] = sarg1;
15348
15349         return (rval);
15350
15351 err:
15352         if (trace)
15353                 dtrace_helper_trace(helper, mstate, vstate,
15354                     DTRACE_HELPTRACE_ERR);
15355
15356         /*
15357          * Restore the arg0 that we saved upon entry.
15358          */
15359         mstate->dtms_arg[0] = sarg0;
15360         mstate->dtms_arg[1] = sarg1;
15361
15362         return (0);
15363 }
15364
15365 static void
15366 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15367     dtrace_vstate_t *vstate)
15368 {
15369         int i;
15370
15371         if (helper->dtha_predicate != NULL)
15372                 dtrace_difo_release(helper->dtha_predicate, vstate);
15373
15374         for (i = 0; i < helper->dtha_nactions; i++) {
15375                 ASSERT(helper->dtha_actions[i] != NULL);
15376                 dtrace_difo_release(helper->dtha_actions[i], vstate);
15377         }
15378
15379         kmem_free(helper->dtha_actions,
15380             helper->dtha_nactions * sizeof (dtrace_difo_t *));
15381         kmem_free(helper, sizeof (dtrace_helper_action_t));
15382 }
15383
15384 static int
15385 dtrace_helper_destroygen(int gen)
15386 {
15387         proc_t *p = curproc;
15388         dtrace_helpers_t *help = p->p_dtrace_helpers;
15389         dtrace_vstate_t *vstate;
15390         int i;
15391
15392         ASSERT(MUTEX_HELD(&dtrace_lock));
15393
15394         if (help == NULL || gen > help->dthps_generation)
15395                 return (EINVAL);
15396
15397         vstate = &help->dthps_vstate;
15398
15399         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15400                 dtrace_helper_action_t *last = NULL, *h, *next;
15401
15402                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15403                         next = h->dtha_next;
15404
15405                         if (h->dtha_generation == gen) {
15406                                 if (last != NULL) {
15407                                         last->dtha_next = next;
15408                                 } else {
15409                                         help->dthps_actions[i] = next;
15410                                 }
15411
15412                                 dtrace_helper_action_destroy(h, vstate);
15413                         } else {
15414                                 last = h;
15415                         }
15416                 }
15417         }
15418
15419         /*
15420          * Interate until we've cleared out all helper providers with the
15421          * given generation number.
15422          */
15423         for (;;) {
15424                 dtrace_helper_provider_t *prov;
15425
15426                 /*
15427                  * Look for a helper provider with the right generation. We
15428                  * have to start back at the beginning of the list each time
15429                  * because we drop dtrace_lock. It's unlikely that we'll make
15430                  * more than two passes.
15431                  */
15432                 for (i = 0; i < help->dthps_nprovs; i++) {
15433                         prov = help->dthps_provs[i];
15434
15435                         if (prov->dthp_generation == gen)
15436                                 break;
15437                 }
15438
15439                 /*
15440                  * If there were no matches, we're done.
15441                  */
15442                 if (i == help->dthps_nprovs)
15443                         break;
15444
15445                 /*
15446                  * Move the last helper provider into this slot.
15447                  */
15448                 help->dthps_nprovs--;
15449                 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15450                 help->dthps_provs[help->dthps_nprovs] = NULL;
15451
15452                 mutex_exit(&dtrace_lock);
15453
15454                 /*
15455                  * If we have a meta provider, remove this helper provider.
15456                  */
15457                 mutex_enter(&dtrace_meta_lock);
15458                 if (dtrace_meta_pid != NULL) {
15459                         ASSERT(dtrace_deferred_pid == NULL);
15460                         dtrace_helper_provider_remove(&prov->dthp_prov,
15461                             p->p_pid);
15462                 }
15463                 mutex_exit(&dtrace_meta_lock);
15464
15465                 dtrace_helper_provider_destroy(prov);
15466
15467                 mutex_enter(&dtrace_lock);
15468         }
15469
15470         return (0);
15471 }
15472
15473 static int
15474 dtrace_helper_validate(dtrace_helper_action_t *helper)
15475 {
15476         int err = 0, i;
15477         dtrace_difo_t *dp;
15478
15479         if ((dp = helper->dtha_predicate) != NULL)
15480                 err += dtrace_difo_validate_helper(dp);
15481
15482         for (i = 0; i < helper->dtha_nactions; i++)
15483                 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15484
15485         return (err == 0);
15486 }
15487
15488 static int
15489 dtrace_helper_action_add(int which, dtrace_ecbdesc_t *ep)
15490 {
15491         dtrace_helpers_t *help;
15492         dtrace_helper_action_t *helper, *last;
15493         dtrace_actdesc_t *act;
15494         dtrace_vstate_t *vstate;
15495         dtrace_predicate_t *pred;
15496         int count = 0, nactions = 0, i;
15497
15498         if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15499                 return (EINVAL);
15500
15501         help = curproc->p_dtrace_helpers;
15502         last = help->dthps_actions[which];
15503         vstate = &help->dthps_vstate;
15504
15505         for (count = 0; last != NULL; last = last->dtha_next) {
15506                 count++;
15507                 if (last->dtha_next == NULL)
15508                         break;
15509         }
15510
15511         /*
15512          * If we already have dtrace_helper_actions_max helper actions for this
15513          * helper action type, we'll refuse to add a new one.
15514          */
15515         if (count >= dtrace_helper_actions_max)
15516                 return (ENOSPC);
15517
15518         helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15519         helper->dtha_generation = help->dthps_generation;
15520
15521         if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15522                 ASSERT(pred->dtp_difo != NULL);
15523                 dtrace_difo_hold(pred->dtp_difo);
15524                 helper->dtha_predicate = pred->dtp_difo;
15525         }
15526
15527         for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15528                 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15529                         goto err;
15530
15531                 if (act->dtad_difo == NULL)
15532                         goto err;
15533
15534                 nactions++;
15535         }
15536
15537         helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15538             (helper->dtha_nactions = nactions), KM_SLEEP);
15539
15540         for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15541                 dtrace_difo_hold(act->dtad_difo);
15542                 helper->dtha_actions[i++] = act->dtad_difo;
15543         }
15544
15545         if (!dtrace_helper_validate(helper))
15546                 goto err;
15547
15548         if (last == NULL) {
15549                 help->dthps_actions[which] = helper;
15550         } else {
15551                 last->dtha_next = helper;
15552         }
15553
15554         if (vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15555                 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15556                 dtrace_helptrace_next = 0;
15557         }
15558
15559         return (0);
15560 err:
15561         dtrace_helper_action_destroy(helper, vstate);
15562         return (EINVAL);
15563 }
15564
15565 static void
15566 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15567     dof_helper_t *dofhp)
15568 {
15569         ASSERT(MUTEX_NOT_HELD(&dtrace_lock));
15570
15571         mutex_enter(&dtrace_meta_lock);
15572         mutex_enter(&dtrace_lock);
15573
15574         if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15575                 /*
15576                  * If the dtrace module is loaded but not attached, or if
15577                  * there aren't isn't a meta provider registered to deal with
15578                  * these provider descriptions, we need to postpone creating
15579                  * the actual providers until later.
15580                  */
15581
15582                 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15583                     dtrace_deferred_pid != help) {
15584                         help->dthps_deferred = 1;
15585                         help->dthps_pid = p->p_pid;
15586                         help->dthps_next = dtrace_deferred_pid;
15587                         help->dthps_prev = NULL;
15588                         if (dtrace_deferred_pid != NULL)
15589                                 dtrace_deferred_pid->dthps_prev = help;
15590                         dtrace_deferred_pid = help;
15591                 }
15592
15593                 mutex_exit(&dtrace_lock);
15594
15595         } else if (dofhp != NULL) {
15596                 /*
15597                  * If the dtrace module is loaded and we have a particular
15598                  * helper provider description, pass that off to the
15599                  * meta provider.
15600                  */
15601
15602                 mutex_exit(&dtrace_lock);
15603
15604                 dtrace_helper_provide(dofhp, p->p_pid);
15605
15606         } else {
15607                 /*
15608                  * Otherwise, just pass all the helper provider descriptions
15609                  * off to the meta provider.
15610                  */
15611
15612                 int i;
15613                 mutex_exit(&dtrace_lock);
15614
15615                 for (i = 0; i < help->dthps_nprovs; i++) {
15616                         dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15617                             p->p_pid);
15618                 }
15619         }
15620
15621         mutex_exit(&dtrace_meta_lock);
15622 }
15623
15624 static int
15625 dtrace_helper_provider_add(dof_helper_t *dofhp, int gen)
15626 {
15627         dtrace_helpers_t *help;
15628         dtrace_helper_provider_t *hprov, **tmp_provs;
15629         uint_t tmp_maxprovs, i;
15630
15631         ASSERT(MUTEX_HELD(&dtrace_lock));
15632
15633         help = curproc->p_dtrace_helpers;
15634         ASSERT(help != NULL);
15635
15636         /*
15637          * If we already have dtrace_helper_providers_max helper providers,
15638          * we're refuse to add a new one.
15639          */
15640         if (help->dthps_nprovs >= dtrace_helper_providers_max)
15641                 return (ENOSPC);
15642
15643         /*
15644          * Check to make sure this isn't a duplicate.
15645          */
15646         for (i = 0; i < help->dthps_nprovs; i++) {
15647                 if (dofhp->dofhp_dof ==
15648                     help->dthps_provs[i]->dthp_prov.dofhp_dof)
15649                         return (EALREADY);
15650         }
15651
15652         hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15653         hprov->dthp_prov = *dofhp;
15654         hprov->dthp_ref = 1;
15655         hprov->dthp_generation = gen;
15656
15657         /*
15658          * Allocate a bigger table for helper providers if it's already full.
15659          */
15660         if (help->dthps_maxprovs == help->dthps_nprovs) {
15661                 tmp_maxprovs = help->dthps_maxprovs;
15662                 tmp_provs = help->dthps_provs;
15663
15664                 if (help->dthps_maxprovs == 0)
15665                         help->dthps_maxprovs = 2;
15666                 else
15667                         help->dthps_maxprovs *= 2;
15668                 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15669                         help->dthps_maxprovs = dtrace_helper_providers_max;
15670
15671                 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15672
15673                 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15674                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15675
15676                 if (tmp_provs != NULL) {
15677                         bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15678                             sizeof (dtrace_helper_provider_t *));
15679                         kmem_free(tmp_provs, tmp_maxprovs *
15680                             sizeof (dtrace_helper_provider_t *));
15681                 }
15682         }
15683
15684         help->dthps_provs[help->dthps_nprovs] = hprov;
15685         help->dthps_nprovs++;
15686
15687         return (0);
15688 }
15689
15690 static void
15691 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15692 {
15693         mutex_enter(&dtrace_lock);
15694
15695         if (--hprov->dthp_ref == 0) {
15696                 dof_hdr_t *dof;
15697                 mutex_exit(&dtrace_lock);
15698                 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15699                 dtrace_dof_destroy(dof);
15700                 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15701         } else {
15702                 mutex_exit(&dtrace_lock);
15703         }
15704 }
15705
15706 static int
15707 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15708 {
15709         uintptr_t daddr = (uintptr_t)dof;
15710         dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15711         dof_provider_t *provider;
15712         dof_probe_t *probe;
15713         uint8_t *arg;
15714         char *strtab, *typestr;
15715         dof_stridx_t typeidx;
15716         size_t typesz;
15717         uint_t nprobes, j, k;
15718
15719         ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15720
15721         if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15722                 dtrace_dof_error(dof, "misaligned section offset");
15723                 return (-1);
15724         }
15725
15726         /*
15727          * The section needs to be large enough to contain the DOF provider
15728          * structure appropriate for the given version.
15729          */
15730         if (sec->dofs_size <
15731             ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15732             offsetof(dof_provider_t, dofpv_prenoffs) :
15733             sizeof (dof_provider_t))) {
15734                 dtrace_dof_error(dof, "provider section too small");
15735                 return (-1);
15736         }
15737
15738         provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15739         str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15740         prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15741         arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15742         off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15743
15744         if (str_sec == NULL || prb_sec == NULL ||
15745             arg_sec == NULL || off_sec == NULL)
15746                 return (-1);
15747
15748         enoff_sec = NULL;
15749
15750         if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15751             provider->dofpv_prenoffs != DOF_SECT_NONE &&
15752             (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15753             provider->dofpv_prenoffs)) == NULL)
15754                 return (-1);
15755
15756         strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15757
15758         if (provider->dofpv_name >= str_sec->dofs_size ||
15759             strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15760                 dtrace_dof_error(dof, "invalid provider name");
15761                 return (-1);
15762         }
15763
15764         if (prb_sec->dofs_entsize == 0 ||
15765             prb_sec->dofs_entsize > prb_sec->dofs_size) {
15766                 dtrace_dof_error(dof, "invalid entry size");
15767                 return (-1);
15768         }
15769
15770         if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15771                 dtrace_dof_error(dof, "misaligned entry size");
15772                 return (-1);
15773         }
15774
15775         if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15776                 dtrace_dof_error(dof, "invalid entry size");
15777                 return (-1);
15778         }
15779
15780         if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15781                 dtrace_dof_error(dof, "misaligned section offset");
15782                 return (-1);
15783         }
15784
15785         if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15786                 dtrace_dof_error(dof, "invalid entry size");
15787                 return (-1);
15788         }
15789
15790         arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15791
15792         nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15793
15794         /*
15795          * Take a pass through the probes to check for errors.
15796          */
15797         for (j = 0; j < nprobes; j++) {
15798                 probe = (dof_probe_t *)(uintptr_t)(daddr +
15799                     prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15800
15801                 if (probe->dofpr_func >= str_sec->dofs_size) {
15802                         dtrace_dof_error(dof, "invalid function name");
15803                         return (-1);
15804                 }
15805
15806                 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15807                         dtrace_dof_error(dof, "function name too long");
15808                         return (-1);
15809                 }
15810
15811                 if (probe->dofpr_name >= str_sec->dofs_size ||
15812                     strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15813                         dtrace_dof_error(dof, "invalid probe name");
15814                         return (-1);
15815                 }
15816
15817                 /*
15818                  * The offset count must not wrap the index, and the offsets
15819                  * must also not overflow the section's data.
15820                  */
15821                 if (probe->dofpr_offidx + probe->dofpr_noffs <
15822                     probe->dofpr_offidx ||
15823                     (probe->dofpr_offidx + probe->dofpr_noffs) *
15824                     off_sec->dofs_entsize > off_sec->dofs_size) {
15825                         dtrace_dof_error(dof, "invalid probe offset");
15826                         return (-1);
15827                 }
15828
15829                 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15830                         /*
15831                          * If there's no is-enabled offset section, make sure
15832                          * there aren't any is-enabled offsets. Otherwise
15833                          * perform the same checks as for probe offsets
15834                          * (immediately above).
15835                          */
15836                         if (enoff_sec == NULL) {
15837                                 if (probe->dofpr_enoffidx != 0 ||
15838                                     probe->dofpr_nenoffs != 0) {
15839                                         dtrace_dof_error(dof, "is-enabled "
15840                                             "offsets with null section");
15841                                         return (-1);
15842                                 }
15843                         } else if (probe->dofpr_enoffidx +
15844                             probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15845                             (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15846                             enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15847                                 dtrace_dof_error(dof, "invalid is-enabled "
15848                                     "offset");
15849                                 return (-1);
15850                         }
15851
15852                         if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15853                                 dtrace_dof_error(dof, "zero probe and "
15854                                     "is-enabled offsets");
15855                                 return (-1);
15856                         }
15857                 } else if (probe->dofpr_noffs == 0) {
15858                         dtrace_dof_error(dof, "zero probe offsets");
15859                         return (-1);
15860                 }
15861
15862                 if (probe->dofpr_argidx + probe->dofpr_xargc <
15863                     probe->dofpr_argidx ||
15864                     (probe->dofpr_argidx + probe->dofpr_xargc) *
15865                     arg_sec->dofs_entsize > arg_sec->dofs_size) {
15866                         dtrace_dof_error(dof, "invalid args");
15867                         return (-1);
15868                 }
15869
15870                 typeidx = probe->dofpr_nargv;
15871                 typestr = strtab + probe->dofpr_nargv;
15872                 for (k = 0; k < probe->dofpr_nargc; k++) {
15873                         if (typeidx >= str_sec->dofs_size) {
15874                                 dtrace_dof_error(dof, "bad "
15875                                     "native argument type");
15876                                 return (-1);
15877                         }
15878
15879                         typesz = strlen(typestr) + 1;
15880                         if (typesz > DTRACE_ARGTYPELEN) {
15881                                 dtrace_dof_error(dof, "native "
15882                                     "argument type too long");
15883                                 return (-1);
15884                         }
15885                         typeidx += typesz;
15886                         typestr += typesz;
15887                 }
15888
15889                 typeidx = probe->dofpr_xargv;
15890                 typestr = strtab + probe->dofpr_xargv;
15891                 for (k = 0; k < probe->dofpr_xargc; k++) {
15892                         if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15893                                 dtrace_dof_error(dof, "bad "
15894                                     "native argument index");
15895                                 return (-1);
15896                         }
15897
15898                         if (typeidx >= str_sec->dofs_size) {
15899                                 dtrace_dof_error(dof, "bad "
15900                                     "translated argument type");
15901                                 return (-1);
15902                         }
15903
15904                         typesz = strlen(typestr) + 1;
15905                         if (typesz > DTRACE_ARGTYPELEN) {
15906                                 dtrace_dof_error(dof, "translated argument "
15907                                     "type too long");
15908                                 return (-1);
15909                         }
15910
15911                         typeidx += typesz;
15912                         typestr += typesz;
15913                 }
15914         }
15915
15916         return (0);
15917 }
15918
15919 static int
15920 dtrace_helper_slurp(dof_hdr_t *dof, dof_helper_t *dhp)
15921 {
15922         dtrace_helpers_t *help;
15923         dtrace_vstate_t *vstate;
15924         dtrace_enabling_t *enab = NULL;
15925         int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15926         uintptr_t daddr = (uintptr_t)dof;
15927
15928         ASSERT(MUTEX_HELD(&dtrace_lock));
15929
15930         if ((help = curproc->p_dtrace_helpers) == NULL)
15931                 help = dtrace_helpers_create(curproc);
15932
15933         vstate = &help->dthps_vstate;
15934
15935         if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15936             dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15937                 dtrace_dof_destroy(dof);
15938                 return (rv);
15939         }
15940
15941         /*
15942          * Look for helper providers and validate their descriptions.
15943          */
15944         if (dhp != NULL) {
15945                 for (i = 0; i < dof->dofh_secnum; i++) {
15946                         dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15947                             dof->dofh_secoff + i * dof->dofh_secsize);
15948
15949                         if (sec->dofs_type != DOF_SECT_PROVIDER)
15950                                 continue;
15951
15952                         if (dtrace_helper_provider_validate(dof, sec) != 0) {
15953                                 dtrace_enabling_destroy(enab);
15954                                 dtrace_dof_destroy(dof);
15955                                 return (-1);
15956                         }
15957
15958                         nprovs++;
15959                 }
15960         }
15961
15962         /*
15963          * Now we need to walk through the ECB descriptions in the enabling.
15964          */
15965         for (i = 0; i < enab->dten_ndesc; i++) {
15966                 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15967                 dtrace_probedesc_t *desc = &ep->dted_probe;
15968
15969                 if (strcmp(desc->dtpd_provider, "dtrace") != 0)
15970                         continue;
15971
15972                 if (strcmp(desc->dtpd_mod, "helper") != 0)
15973                         continue;
15974
15975                 if (strcmp(desc->dtpd_func, "ustack") != 0)
15976                         continue;
15977
15978                 if ((rv = dtrace_helper_action_add(DTRACE_HELPER_ACTION_USTACK,
15979                     ep)) != 0) {
15980                         /*
15981                          * Adding this helper action failed -- we are now going
15982                          * to rip out the entire generation and return failure.
15983                          */
15984                         (void) dtrace_helper_destroygen(help->dthps_generation);
15985                         dtrace_enabling_destroy(enab);
15986                         dtrace_dof_destroy(dof);
15987                         return (-1);
15988                 }
15989
15990                 nhelpers++;
15991         }
15992
15993         if (nhelpers < enab->dten_ndesc)
15994                 dtrace_dof_error(dof, "unmatched helpers");
15995
15996         gen = help->dthps_generation++;
15997         dtrace_enabling_destroy(enab);
15998
15999         if (dhp != NULL && nprovs > 0) {
16000                 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
16001                 if (dtrace_helper_provider_add(dhp, gen) == 0) {
16002                         mutex_exit(&dtrace_lock);
16003                         dtrace_helper_provider_register(curproc, help, dhp);
16004                         mutex_enter(&dtrace_lock);
16005
16006                         destroy = 0;
16007                 }
16008         }
16009
16010         if (destroy)
16011                 dtrace_dof_destroy(dof);
16012
16013         return (gen);
16014 }
16015
16016 static dtrace_helpers_t *
16017 dtrace_helpers_create(proc_t *p)
16018 {
16019         dtrace_helpers_t *help;
16020
16021         ASSERT(MUTEX_HELD(&dtrace_lock));
16022         ASSERT(p->p_dtrace_helpers == NULL);
16023
16024         help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16025         help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16026             DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16027
16028         p->p_dtrace_helpers = help;
16029         dtrace_helpers++;
16030
16031         return (help);
16032 }
16033
16034 #if defined(sun)
16035 static
16036 #endif
16037 void
16038 dtrace_helpers_destroy(proc_t *p)
16039 {
16040         dtrace_helpers_t *help;
16041         dtrace_vstate_t *vstate;
16042 #if defined(sun)
16043         proc_t *p = curproc;
16044 #endif
16045         int i;
16046
16047         mutex_enter(&dtrace_lock);
16048
16049         ASSERT(p->p_dtrace_helpers != NULL);
16050         ASSERT(dtrace_helpers > 0);
16051
16052         help = p->p_dtrace_helpers;
16053         vstate = &help->dthps_vstate;
16054
16055         /*
16056          * We're now going to lose the help from this process.
16057          */
16058         p->p_dtrace_helpers = NULL;
16059         dtrace_sync();
16060
16061         /*
16062          * Destory the helper actions.
16063          */
16064         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16065                 dtrace_helper_action_t *h, *next;
16066
16067                 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16068                         next = h->dtha_next;
16069                         dtrace_helper_action_destroy(h, vstate);
16070                         h = next;
16071                 }
16072         }
16073
16074         mutex_exit(&dtrace_lock);
16075
16076         /*
16077          * Destroy the helper providers.
16078          */
16079         if (help->dthps_maxprovs > 0) {
16080                 mutex_enter(&dtrace_meta_lock);
16081                 if (dtrace_meta_pid != NULL) {
16082                         ASSERT(dtrace_deferred_pid == NULL);
16083
16084                         for (i = 0; i < help->dthps_nprovs; i++) {
16085                                 dtrace_helper_provider_remove(
16086                                     &help->dthps_provs[i]->dthp_prov, p->p_pid);
16087                         }
16088                 } else {
16089                         mutex_enter(&dtrace_lock);
16090                         ASSERT(help->dthps_deferred == 0 ||
16091                             help->dthps_next != NULL ||
16092                             help->dthps_prev != NULL ||
16093                             help == dtrace_deferred_pid);
16094
16095                         /*
16096                          * Remove the helper from the deferred list.
16097                          */
16098                         if (help->dthps_next != NULL)
16099                                 help->dthps_next->dthps_prev = help->dthps_prev;
16100                         if (help->dthps_prev != NULL)
16101                                 help->dthps_prev->dthps_next = help->dthps_next;
16102                         if (dtrace_deferred_pid == help) {
16103                                 dtrace_deferred_pid = help->dthps_next;
16104                                 ASSERT(help->dthps_prev == NULL);
16105                         }
16106
16107                         mutex_exit(&dtrace_lock);
16108                 }
16109
16110                 mutex_exit(&dtrace_meta_lock);
16111
16112                 for (i = 0; i < help->dthps_nprovs; i++) {
16113                         dtrace_helper_provider_destroy(help->dthps_provs[i]);
16114                 }
16115
16116                 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16117                     sizeof (dtrace_helper_provider_t *));
16118         }
16119
16120         mutex_enter(&dtrace_lock);
16121
16122         dtrace_vstate_fini(&help->dthps_vstate);
16123         kmem_free(help->dthps_actions,
16124             sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16125         kmem_free(help, sizeof (dtrace_helpers_t));
16126
16127         --dtrace_helpers;
16128         mutex_exit(&dtrace_lock);
16129 }
16130
16131 #if defined(sun)
16132 static
16133 #endif
16134 void
16135 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16136 {
16137         dtrace_helpers_t *help, *newhelp;
16138         dtrace_helper_action_t *helper, *new, *last;
16139         dtrace_difo_t *dp;
16140         dtrace_vstate_t *vstate;
16141         int i, j, sz, hasprovs = 0;
16142
16143         mutex_enter(&dtrace_lock);
16144         ASSERT(from->p_dtrace_helpers != NULL);
16145         ASSERT(dtrace_helpers > 0);
16146
16147         help = from->p_dtrace_helpers;
16148         newhelp = dtrace_helpers_create(to);
16149         ASSERT(to->p_dtrace_helpers != NULL);
16150
16151         newhelp->dthps_generation = help->dthps_generation;
16152         vstate = &newhelp->dthps_vstate;
16153
16154         /*
16155          * Duplicate the helper actions.
16156          */
16157         for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16158                 if ((helper = help->dthps_actions[i]) == NULL)
16159                         continue;
16160
16161                 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16162                         new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16163                             KM_SLEEP);
16164                         new->dtha_generation = helper->dtha_generation;
16165
16166                         if ((dp = helper->dtha_predicate) != NULL) {
16167                                 dp = dtrace_difo_duplicate(dp, vstate);
16168                                 new->dtha_predicate = dp;
16169                         }
16170
16171                         new->dtha_nactions = helper->dtha_nactions;
16172                         sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16173                         new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16174
16175                         for (j = 0; j < new->dtha_nactions; j++) {
16176                                 dtrace_difo_t *dp = helper->dtha_actions[j];
16177
16178                                 ASSERT(dp != NULL);
16179                                 dp = dtrace_difo_duplicate(dp, vstate);
16180                                 new->dtha_actions[j] = dp;
16181                         }
16182
16183                         if (last != NULL) {
16184                                 last->dtha_next = new;
16185                         } else {
16186                                 newhelp->dthps_actions[i] = new;
16187                         }
16188
16189                         last = new;
16190                 }
16191         }
16192
16193         /*
16194          * Duplicate the helper providers and register them with the
16195          * DTrace framework.
16196          */
16197         if (help->dthps_nprovs > 0) {
16198                 newhelp->dthps_nprovs = help->dthps_nprovs;
16199                 newhelp->dthps_maxprovs = help->dthps_nprovs;
16200                 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16201                     sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16202                 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16203                         newhelp->dthps_provs[i] = help->dthps_provs[i];
16204                         newhelp->dthps_provs[i]->dthp_ref++;
16205                 }
16206
16207                 hasprovs = 1;
16208         }
16209
16210         mutex_exit(&dtrace_lock);
16211
16212         if (hasprovs)
16213                 dtrace_helper_provider_register(to, newhelp, NULL);
16214 }
16215
16216 /*
16217  * DTrace Hook Functions
16218  */
16219 static void
16220 dtrace_module_loaded(modctl_t *ctl)
16221 {
16222         dtrace_provider_t *prv;
16223
16224         mutex_enter(&dtrace_provider_lock);
16225 #if defined(sun)
16226         mutex_enter(&mod_lock);
16227 #endif
16228
16229 #if defined(sun)
16230         ASSERT(ctl->mod_busy);
16231 #endif
16232
16233         /*
16234          * We're going to call each providers per-module provide operation
16235          * specifying only this module.
16236          */
16237         for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16238                 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16239
16240 #if defined(sun)
16241         mutex_exit(&mod_lock);
16242 #endif
16243         mutex_exit(&dtrace_provider_lock);
16244
16245         /*
16246          * If we have any retained enablings, we need to match against them.
16247          * Enabling probes requires that cpu_lock be held, and we cannot hold
16248          * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16249          * module.  (In particular, this happens when loading scheduling
16250          * classes.)  So if we have any retained enablings, we need to dispatch
16251          * our task queue to do the match for us.
16252          */
16253         mutex_enter(&dtrace_lock);
16254
16255         if (dtrace_retained == NULL) {
16256                 mutex_exit(&dtrace_lock);
16257                 return;
16258         }
16259
16260         (void) taskq_dispatch(dtrace_taskq,
16261             (task_func_t *)dtrace_enabling_matchall, NULL, TQ_SLEEP);
16262
16263         mutex_exit(&dtrace_lock);
16264
16265         /*
16266          * And now, for a little heuristic sleaze:  in general, we want to
16267          * match modules as soon as they load.  However, we cannot guarantee
16268          * this, because it would lead us to the lock ordering violation
16269          * outlined above.  The common case, of course, is that cpu_lock is
16270          * _not_ held -- so we delay here for a clock tick, hoping that that's
16271          * long enough for the task queue to do its work.  If it's not, it's
16272          * not a serious problem -- it just means that the module that we
16273          * just loaded may not be immediately instrumentable.
16274          */
16275         delay(1);
16276 }
16277
16278 static void
16279 #if defined(sun)
16280 dtrace_module_unloaded(modctl_t *ctl)
16281 #else
16282 dtrace_module_unloaded(modctl_t *ctl, int *error)
16283 #endif
16284 {
16285         dtrace_probe_t template, *probe, *first, *next;
16286         dtrace_provider_t *prov;
16287 #if !defined(sun)
16288         char modname[DTRACE_MODNAMELEN];
16289         size_t len;
16290 #endif
16291
16292 #if defined(sun)
16293         template.dtpr_mod = ctl->mod_modname;
16294 #else
16295         /* Handle the fact that ctl->filename may end in ".ko". */
16296         strlcpy(modname, ctl->filename, sizeof(modname));
16297         len = strlen(ctl->filename);
16298         if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
16299                 modname[len - 3] = '\0';
16300         template.dtpr_mod = modname;
16301 #endif
16302
16303         mutex_enter(&dtrace_provider_lock);
16304 #if defined(sun)
16305         mutex_enter(&mod_lock);
16306 #endif
16307         mutex_enter(&dtrace_lock);
16308
16309 #if !defined(sun)
16310         if (ctl->nenabled > 0) {
16311                 /* Don't allow unloads if a probe is enabled. */
16312                 mutex_exit(&dtrace_provider_lock);
16313                 mutex_exit(&dtrace_lock);
16314                 *error = -1;
16315                 printf(
16316         "kldunload: attempt to unload module that has DTrace probes enabled\n");
16317                 return;
16318         }
16319 #endif
16320
16321         if (dtrace_bymod == NULL) {
16322                 /*
16323                  * The DTrace module is loaded (obviously) but not attached;
16324                  * we don't have any work to do.
16325                  */
16326                 mutex_exit(&dtrace_provider_lock);
16327 #if defined(sun)
16328                 mutex_exit(&mod_lock);
16329 #endif
16330                 mutex_exit(&dtrace_lock);
16331                 return;
16332         }
16333
16334         for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
16335             probe != NULL; probe = probe->dtpr_nextmod) {
16336                 if (probe->dtpr_ecb != NULL) {
16337                         mutex_exit(&dtrace_provider_lock);
16338 #if defined(sun)
16339                         mutex_exit(&mod_lock);
16340 #endif
16341                         mutex_exit(&dtrace_lock);
16342
16343                         /*
16344                          * This shouldn't _actually_ be possible -- we're
16345                          * unloading a module that has an enabled probe in it.
16346                          * (It's normally up to the provider to make sure that
16347                          * this can't happen.)  However, because dtps_enable()
16348                          * doesn't have a failure mode, there can be an
16349                          * enable/unload race.  Upshot:  we don't want to
16350                          * assert, but we're not going to disable the
16351                          * probe, either.
16352                          */
16353                         if (dtrace_err_verbose) {
16354 #if defined(sun)
16355                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16356                                     "enabled probes", ctl->mod_modname);
16357 #else
16358                                 cmn_err(CE_WARN, "unloaded module '%s' had "
16359                                     "enabled probes", modname);
16360 #endif
16361                         }
16362
16363                         return;
16364                 }
16365         }
16366
16367         probe = first;
16368
16369         for (first = NULL; probe != NULL; probe = next) {
16370                 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
16371
16372                 dtrace_probes[probe->dtpr_id - 1] = NULL;
16373
16374                 next = probe->dtpr_nextmod;
16375                 dtrace_hash_remove(dtrace_bymod, probe);
16376                 dtrace_hash_remove(dtrace_byfunc, probe);
16377                 dtrace_hash_remove(dtrace_byname, probe);
16378
16379                 if (first == NULL) {
16380                         first = probe;
16381                         probe->dtpr_nextmod = NULL;
16382                 } else {
16383                         probe->dtpr_nextmod = first;
16384                         first = probe;
16385                 }
16386         }
16387
16388         /*
16389          * We've removed all of the module's probes from the hash chains and
16390          * from the probe array.  Now issue a dtrace_sync() to be sure that
16391          * everyone has cleared out from any probe array processing.
16392          */
16393         dtrace_sync();
16394
16395         for (probe = first; probe != NULL; probe = first) {
16396                 first = probe->dtpr_nextmod;
16397                 prov = probe->dtpr_provider;
16398                 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
16399                     probe->dtpr_arg);
16400                 kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
16401                 kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
16402                 kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
16403 #if defined(sun)
16404                 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
16405 #else
16406                 free_unr(dtrace_arena, probe->dtpr_id);
16407 #endif
16408                 kmem_free(probe, sizeof (dtrace_probe_t));
16409         }
16410
16411         mutex_exit(&dtrace_lock);
16412 #if defined(sun)
16413         mutex_exit(&mod_lock);
16414 #endif
16415         mutex_exit(&dtrace_provider_lock);
16416 }
16417
16418 #if !defined(sun)
16419 static void
16420 dtrace_kld_load(void *arg __unused, linker_file_t lf)
16421 {
16422
16423         dtrace_module_loaded(lf);
16424 }
16425
16426 static void
16427 dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
16428 {
16429
16430         if (*error != 0)
16431                 /* We already have an error, so don't do anything. */
16432                 return;
16433         dtrace_module_unloaded(lf, error);
16434 }
16435 #endif
16436
16437 #if defined(sun)
16438 static void
16439 dtrace_suspend(void)
16440 {
16441         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
16442 }
16443
16444 static void
16445 dtrace_resume(void)
16446 {
16447         dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
16448 }
16449 #endif
16450
16451 static int
16452 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
16453 {
16454         ASSERT(MUTEX_HELD(&cpu_lock));
16455         mutex_enter(&dtrace_lock);
16456
16457         switch (what) {
16458         case CPU_CONFIG: {
16459                 dtrace_state_t *state;
16460                 dtrace_optval_t *opt, rs, c;
16461
16462                 /*
16463                  * For now, we only allocate a new buffer for anonymous state.
16464                  */
16465                 if ((state = dtrace_anon.dta_state) == NULL)
16466                         break;
16467
16468                 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
16469                         break;
16470
16471                 opt = state->dts_options;
16472                 c = opt[DTRACEOPT_CPU];
16473
16474                 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
16475                         break;
16476
16477                 /*
16478                  * Regardless of what the actual policy is, we're going to
16479                  * temporarily set our resize policy to be manual.  We're
16480                  * also going to temporarily set our CPU option to denote
16481                  * the newly configured CPU.
16482                  */
16483                 rs = opt[DTRACEOPT_BUFRESIZE];
16484                 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
16485                 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
16486
16487                 (void) dtrace_state_buffers(state);
16488
16489                 opt[DTRACEOPT_BUFRESIZE] = rs;
16490                 opt[DTRACEOPT_CPU] = c;
16491
16492                 break;
16493         }
16494
16495         case CPU_UNCONFIG:
16496                 /*
16497                  * We don't free the buffer in the CPU_UNCONFIG case.  (The
16498                  * buffer will be freed when the consumer exits.)
16499                  */
16500                 break;
16501
16502         default:
16503                 break;
16504         }
16505
16506         mutex_exit(&dtrace_lock);
16507         return (0);
16508 }
16509
16510 #if defined(sun)
16511 static void
16512 dtrace_cpu_setup_initial(processorid_t cpu)
16513 {
16514         (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
16515 }
16516 #endif
16517
16518 static void
16519 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
16520 {
16521         if (dtrace_toxranges >= dtrace_toxranges_max) {
16522                 int osize, nsize;
16523                 dtrace_toxrange_t *range;
16524
16525                 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16526
16527                 if (osize == 0) {
16528                         ASSERT(dtrace_toxrange == NULL);
16529                         ASSERT(dtrace_toxranges_max == 0);
16530                         dtrace_toxranges_max = 1;
16531                 } else {
16532                         dtrace_toxranges_max <<= 1;
16533                 }
16534
16535                 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
16536                 range = kmem_zalloc(nsize, KM_SLEEP);
16537
16538                 if (dtrace_toxrange != NULL) {
16539                         ASSERT(osize != 0);
16540                         bcopy(dtrace_toxrange, range, osize);
16541                         kmem_free(dtrace_toxrange, osize);
16542                 }
16543
16544                 dtrace_toxrange = range;
16545         }
16546
16547         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
16548         ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
16549
16550         dtrace_toxrange[dtrace_toxranges].dtt_base = base;
16551         dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
16552         dtrace_toxranges++;
16553 }
16554
/*
 * Barrier used in connection with getf().  On sun builds a dtrace_sync()
 * is issued when either the current thread's zone or kcred's zone has a
 * non-zero zone_dtrace_getf count; on all other builds this is a no-op.
 */
static void
dtrace_getf_barrier(void)
{
#if defined(sun)
	/*
	 * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
	 * that contain calls to getf(), this routine will be called on every
	 * closef() before either the underlying vnode is released or the
	 * file_t itself is freed.  By the time we are here, it is essential
	 * that the file_t can no longer be accessed from a call to getf()
	 * in probe context -- that assures that a dtrace_sync() can be used
	 * to clear out any enablings referring to the old structures.
	 */
	if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
	    kcred->cr_zone->zone_dtrace_getf != 0)
		dtrace_sync();
#endif
}
16573
16574 /*
16575  * DTrace Driver Cookbook Functions
16576  */
16577 #if defined(sun)
16578 /*ARGSUSED*/
16579 static int
16580 dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
16581 {
16582         dtrace_provider_id_t id;
16583         dtrace_state_t *state = NULL;
16584         dtrace_enabling_t *enab;
16585
16586         mutex_enter(&cpu_lock);
16587         mutex_enter(&dtrace_provider_lock);
16588         mutex_enter(&dtrace_lock);
16589
16590         if (ddi_soft_state_init(&dtrace_softstate,
16591             sizeof (dtrace_state_t), 0) != 0) {
16592                 cmn_err(CE_NOTE, "/dev/dtrace failed to initialize soft state");
16593                 mutex_exit(&cpu_lock);
16594                 mutex_exit(&dtrace_provider_lock);
16595                 mutex_exit(&dtrace_lock);
16596                 return (DDI_FAILURE);
16597         }
16598
16599         if (ddi_create_minor_node(devi, DTRACEMNR_DTRACE, S_IFCHR,
16600             DTRACEMNRN_DTRACE, DDI_PSEUDO, NULL) == DDI_FAILURE ||
16601             ddi_create_minor_node(devi, DTRACEMNR_HELPER, S_IFCHR,
16602             DTRACEMNRN_HELPER, DDI_PSEUDO, NULL) == DDI_FAILURE) {
16603                 cmn_err(CE_NOTE, "/dev/dtrace couldn't create minor nodes");
16604                 ddi_remove_minor_node(devi, NULL);
16605                 ddi_soft_state_fini(&dtrace_softstate);
16606                 mutex_exit(&cpu_lock);
16607                 mutex_exit(&dtrace_provider_lock);
16608                 mutex_exit(&dtrace_lock);
16609                 return (DDI_FAILURE);
16610         }
16611
16612         ddi_report_dev(devi);
16613         dtrace_devi = devi;
16614
16615         dtrace_modload = dtrace_module_loaded;
16616         dtrace_modunload = dtrace_module_unloaded;
16617         dtrace_cpu_init = dtrace_cpu_setup_initial;
16618         dtrace_helpers_cleanup = dtrace_helpers_destroy;
16619         dtrace_helpers_fork = dtrace_helpers_duplicate;
16620         dtrace_cpustart_init = dtrace_suspend;
16621         dtrace_cpustart_fini = dtrace_resume;
16622         dtrace_debugger_init = dtrace_suspend;
16623         dtrace_debugger_fini = dtrace_resume;
16624
16625         register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
16626
16627         ASSERT(MUTEX_HELD(&cpu_lock));
16628
16629         dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
16630             NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
16631         dtrace_minor = vmem_create("dtrace_minor", (void *)DTRACEMNRN_CLONE,
16632             UINT32_MAX - DTRACEMNRN_CLONE, 1, NULL, NULL, NULL, 0,
16633             VM_SLEEP | VMC_IDENTIFIER);
16634         dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
16635             1, INT_MAX, 0);
16636
16637         dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
16638             sizeof (dtrace_dstate_percpu_t) * NCPU, DTRACE_STATE_ALIGN,
16639             NULL, NULL, NULL, NULL, NULL, 0);
16640
16641         ASSERT(MUTEX_HELD(&cpu_lock));
16642         dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
16643             offsetof(dtrace_probe_t, dtpr_nextmod),
16644             offsetof(dtrace_probe_t, dtpr_prevmod));
16645
16646         dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
16647             offsetof(dtrace_probe_t, dtpr_nextfunc),
16648             offsetof(dtrace_probe_t, dtpr_prevfunc));
16649
16650         dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
16651             offsetof(dtrace_probe_t, dtpr_nextname),
16652             offsetof(dtrace_probe_t, dtpr_prevname));
16653
16654         if (dtrace_retain_max < 1) {
16655                 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
16656                     "setting to 1", dtrace_retain_max);
16657                 dtrace_retain_max = 1;
16658         }
16659
16660         /*
16661          * Now discover our toxic ranges.
16662          */
16663         dtrace_toxic_ranges(dtrace_toxrange_add);
16664
16665         /*
16666          * Before we register ourselves as a provider to our own framework,
16667          * we would like to assert that dtrace_provider is NULL -- but that's
16668          * not true if we were loaded as a dependency of a DTrace provider.
16669          * Once we've registered, we can assert that dtrace_provider is our
16670          * pseudo provider.
16671          */
16672         (void) dtrace_register("dtrace", &dtrace_provider_attr,
16673             DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
16674
16675         ASSERT(dtrace_provider != NULL);
16676         ASSERT((dtrace_provider_id_t)dtrace_provider == id);
16677
16678         dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
16679             dtrace_provider, NULL, NULL, "BEGIN", 0, NULL);
16680         dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
16681             dtrace_provider, NULL, NULL, "END", 0, NULL);
16682         dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
16683             dtrace_provider, NULL, NULL, "ERROR", 1, NULL);
16684
16685         dtrace_anon_property();
16686         mutex_exit(&cpu_lock);
16687
16688         /*
16689          * If there are already providers, we must ask them to provide their
16690          * probes, and then match any anonymous enabling against them.  Note
16691          * that there should be no other retained enablings at this time:
16692          * the only retained enablings at this time should be the anonymous
16693          * enabling.
16694          */
16695         if (dtrace_anon.dta_enabling != NULL) {
16696                 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
16697
16698                 dtrace_enabling_provide(NULL);
16699                 state = dtrace_anon.dta_state;
16700
16701                 /*
16702                  * We couldn't hold cpu_lock across the above call to
16703                  * dtrace_enabling_provide(), but we must hold it to actually
16704                  * enable the probes.  We have to drop all of our locks, pick
16705                  * up cpu_lock, and regain our locks before matching the
16706                  * retained anonymous enabling.
16707                  */
16708                 mutex_exit(&dtrace_lock);
16709                 mutex_exit(&dtrace_provider_lock);
16710
16711                 mutex_enter(&cpu_lock);
16712                 mutex_enter(&dtrace_provider_lock);
16713                 mutex_enter(&dtrace_lock);
16714
16715                 if ((enab = dtrace_anon.dta_enabling) != NULL)
16716                         (void) dtrace_enabling_match(enab, NULL);
16717
16718                 mutex_exit(&cpu_lock);
16719         }
16720
16721         mutex_exit(&dtrace_lock);
16722         mutex_exit(&dtrace_provider_lock);
16723
16724         if (state != NULL) {
16725                 /*
16726                  * If we created any anonymous state, set it going now.
16727                  */
16728                 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
16729         }
16730
16731         return (DDI_SUCCESS);
16732 }
16733 #endif
16734
16735 #if !defined(sun)
16736 static void dtrace_dtr(void *);
16737 #endif
16738
16739 /*ARGSUSED*/
16740 static int
16741 #if defined(sun)
16742 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
16743 #else
16744 dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
16745 #endif
16746 {
16747         dtrace_state_t *state;
16748         uint32_t priv;
16749         uid_t uid;
16750         zoneid_t zoneid;
16751
16752 #if defined(sun)
16753         if (getminor(*devp) == DTRACEMNRN_HELPER)
16754                 return (0);
16755
16756         /*
16757          * If this wasn't an open with the "helper" minor, then it must be
16758          * the "dtrace" minor.
16759          */
16760         if (getminor(*devp) == DTRACEMNRN_DTRACE)
16761                 return (ENXIO);
16762 #else
16763         cred_t *cred_p = NULL;
16764         cred_p = dev->si_cred;
16765
16766         /*
16767          * If no DTRACE_PRIV_* bits are set in the credential, then the
16768          * caller lacks sufficient permission to do anything with DTrace.
16769          */
16770         dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
16771         if (priv == DTRACE_PRIV_NONE) {
16772 #endif
16773
16774                 return (EACCES);
16775         }
16776
16777         /*
16778          * Ask all providers to provide all their probes.
16779          */
16780         mutex_enter(&dtrace_provider_lock);
16781         dtrace_probe_provide(NULL, NULL);
16782         mutex_exit(&dtrace_provider_lock);
16783
16784         mutex_enter(&cpu_lock);
16785         mutex_enter(&dtrace_lock);
16786         dtrace_opens++;
16787         dtrace_membar_producer();
16788
16789 #if defined(sun)
16790         /*
16791          * If the kernel debugger is active (that is, if the kernel debugger
16792          * modified text in some way), we won't allow the open.
16793          */
16794         if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
16795                 dtrace_opens--;
16796                 mutex_exit(&cpu_lock);
16797                 mutex_exit(&dtrace_lock);
16798                 return (EBUSY);
16799         }
16800
16801         if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
16802                 /*
16803                  * If DTrace helper tracing is enabled, we need to allocate the
16804                  * trace buffer and initialize the values.
16805                  */
16806                 dtrace_helptrace_buffer =
16807                     kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
16808                 dtrace_helptrace_next = 0;
16809                 dtrace_helptrace_wrapped = 0;
16810                 dtrace_helptrace_enable = 0;
16811         }
16812
16813         state = dtrace_state_create(devp, cred_p);
16814 #else
16815         state = dtrace_state_create(dev);
16816         devfs_set_cdevpriv(state, dtrace_dtr);
16817 #endif
16818
16819         mutex_exit(&cpu_lock);
16820
16821         if (state == NULL) {
16822 #if defined(sun)
16823                 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16824                         (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16825 #else
16826                 --dtrace_opens;
16827 #endif
16828                 mutex_exit(&dtrace_lock);
16829                 return (EAGAIN);
16830         }
16831
16832         mutex_exit(&dtrace_lock);
16833
16834         return (0);
16835 }
16836
16837 /*ARGSUSED*/
16838 #if defined(sun)
16839 static int
16840 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
16841 #else
16842 static void
16843 dtrace_dtr(void *data)
16844 #endif
16845 {
16846 #if defined(sun)
16847         minor_t minor = getminor(dev);
16848         dtrace_state_t *state;
16849 #endif
16850         dtrace_helptrace_t *buf = NULL;
16851
16852 #ifdef illumos
16853         if (minor == DTRACEMNRN_HELPER)
16854                 return (0);
16855
16856         state = ddi_get_soft_state(dtrace_softstate, minor);
16857 #else
16858         dtrace_state_t *state = data;
16859 #endif
16860
16861         mutex_enter(&cpu_lock);
16862         mutex_enter(&dtrace_lock);
16863
16864 #ifdef illumos
16865         if (state->dts_anon)
16866 #else
16867         if (state != NULL && state->dts_anon)
16868 #endif
16869         {
16870                 /*
16871                  * There is anonymous state. Destroy that first.
16872                  */
16873                 ASSERT(dtrace_anon.dta_state == NULL);
16874                 dtrace_state_destroy(state->dts_anon);
16875         }
16876
16877         if (dtrace_helptrace_disable) {
16878                 /*
16879                  * If we have been told to disable helper tracing, set the
16880                  * buffer to NULL before calling into dtrace_state_destroy();
16881                  * we take advantage of its dtrace_sync() to know that no
16882                  * CPU is in probe context with enabled helper tracing
16883                  * after it returns.
16884                  */
16885                 buf = dtrace_helptrace_buffer;
16886                 dtrace_helptrace_buffer = NULL;
16887         }
16888
16889 #ifdef illumos
16890         dtrace_state_destroy(state);
16891 #else
16892         if (state != NULL) {
16893                 dtrace_state_destroy(state);
16894                 kmem_free(state, 0);
16895         }
16896 #endif
16897         ASSERT(dtrace_opens > 0);
16898
16899 #if defined(sun)
16900         /*
16901          * Only relinquish control of the kernel debugger interface when there
16902          * are no consumers and no anonymous enablings.
16903          */
16904         if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
16905                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
16906 #else
16907         --dtrace_opens;
16908 #endif
16909
16910         if (buf != NULL) {
16911                 kmem_free(buf, dtrace_helptrace_bufsize);
16912                 dtrace_helptrace_disable = 0;
16913         }
16914
16915         mutex_exit(&dtrace_lock);
16916         mutex_exit(&cpu_lock);
16917
16918 #if defined(sun)
16919         return (0);
16920 #endif
16921 }
16922
16923 #if defined(sun)
16924 /*ARGSUSED*/
16925 static int
16926 dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
16927 {
16928         int rval;
16929         dof_helper_t help, *dhp = NULL;
16930
16931         switch (cmd) {
16932         case DTRACEHIOC_ADDDOF:
16933                 if (copyin((void *)arg, &help, sizeof (help)) != 0) {
16934                         dtrace_dof_error(NULL, "failed to copyin DOF helper");
16935                         return (EFAULT);
16936                 }
16937
16938                 dhp = &help;
16939                 arg = (intptr_t)help.dofhp_dof;
16940                 /*FALLTHROUGH*/
16941
16942         case DTRACEHIOC_ADD: {
16943                 dof_hdr_t *dof = dtrace_dof_copyin(arg, &rval);
16944
16945                 if (dof == NULL)
16946                         return (rval);
16947
16948                 mutex_enter(&dtrace_lock);
16949
16950                 /*
16951                  * dtrace_helper_slurp() takes responsibility for the dof --
16952                  * it may free it now or it may save it and free it later.
16953                  */
16954                 if ((rval = dtrace_helper_slurp(dof, dhp)) != -1) {
16955                         *rv = rval;
16956                         rval = 0;
16957                 } else {
16958                         rval = EINVAL;
16959                 }
16960
16961                 mutex_exit(&dtrace_lock);
16962                 return (rval);
16963         }
16964
16965         case DTRACEHIOC_REMOVE: {
16966                 mutex_enter(&dtrace_lock);
16967                 rval = dtrace_helper_destroygen(arg);
16968                 mutex_exit(&dtrace_lock);
16969
16970                 return (rval);
16971         }
16972
16973         default:
16974                 break;
16975         }
16976
16977         return (ENOTTY);
16978 }
16979
16980 /*ARGSUSED*/
16981 static int
16982 dtrace_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
16983 {
16984         minor_t minor = getminor(dev);
16985         dtrace_state_t *state;
16986         int rval;
16987
16988         if (minor == DTRACEMNRN_HELPER)
16989                 return (dtrace_ioctl_helper(cmd, arg, rv));
16990
16991         state = ddi_get_soft_state(dtrace_softstate, minor);
16992
16993         if (state->dts_anon) {
16994                 ASSERT(dtrace_anon.dta_state == NULL);
16995                 state = state->dts_anon;
16996         }
16997
16998         switch (cmd) {
16999         case DTRACEIOC_PROVIDER: {
17000                 dtrace_providerdesc_t pvd;
17001                 dtrace_provider_t *pvp;
17002
17003                 if (copyin((void *)arg, &pvd, sizeof (pvd)) != 0)
17004                         return (EFAULT);
17005
17006                 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17007                 mutex_enter(&dtrace_provider_lock);
17008
17009                 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17010                         if (strcmp(pvp->dtpv_name, pvd.dtvd_name) == 0)
17011                                 break;
17012                 }
17013
17014                 mutex_exit(&dtrace_provider_lock);
17015
17016                 if (pvp == NULL)
17017                         return (ESRCH);
17018
17019                 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17020                 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17021
17022                 if (copyout(&pvd, (void *)arg, sizeof (pvd)) != 0)
17023                         return (EFAULT);
17024
17025                 return (0);
17026         }
17027
17028         case DTRACEIOC_EPROBE: {
17029                 dtrace_eprobedesc_t epdesc;
17030                 dtrace_ecb_t *ecb;
17031                 dtrace_action_t *act;
17032                 void *buf;
17033                 size_t size;
17034                 uintptr_t dest;
17035                 int nrecs;
17036
17037                 if (copyin((void *)arg, &epdesc, sizeof (epdesc)) != 0)
17038                         return (EFAULT);
17039
17040                 mutex_enter(&dtrace_lock);
17041
17042                 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17043                         mutex_exit(&dtrace_lock);
17044                         return (EINVAL);
17045                 }
17046
17047                 if (ecb->dte_probe == NULL) {
17048                         mutex_exit(&dtrace_lock);
17049                         return (EINVAL);
17050                 }
17051
17052                 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17053                 epdesc.dtepd_uarg = ecb->dte_uarg;
17054                 epdesc.dtepd_size = ecb->dte_size;
17055
17056                 nrecs = epdesc.dtepd_nrecs;
17057                 epdesc.dtepd_nrecs = 0;
17058                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17059                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17060                                 continue;
17061
17062                         epdesc.dtepd_nrecs++;
17063                 }
17064
17065                 /*
17066                  * Now that we have the size, we need to allocate a temporary
17067                  * buffer in which to store the complete description.  We need
17068                  * the temporary buffer to be able to drop dtrace_lock()
17069                  * across the copyout(), below.
17070                  */
17071                 size = sizeof (dtrace_eprobedesc_t) +
17072                     (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17073
17074                 buf = kmem_alloc(size, KM_SLEEP);
17075                 dest = (uintptr_t)buf;
17076
17077                 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17078                 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17079
17080                 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17081                         if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17082                                 continue;
17083
17084                         if (nrecs-- == 0)
17085                                 break;
17086
17087                         bcopy(&act->dta_rec, (void *)dest,
17088                             sizeof (dtrace_recdesc_t));
17089                         dest += sizeof (dtrace_recdesc_t);
17090                 }
17091
17092                 mutex_exit(&dtrace_lock);
17093
17094                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17095                         kmem_free(buf, size);
17096                         return (EFAULT);
17097                 }
17098
17099                 kmem_free(buf, size);
17100                 return (0);
17101         }
17102
17103         case DTRACEIOC_AGGDESC: {
17104                 dtrace_aggdesc_t aggdesc;
17105                 dtrace_action_t *act;
17106                 dtrace_aggregation_t *agg;
17107                 int nrecs;
17108                 uint32_t offs;
17109                 dtrace_recdesc_t *lrec;
17110                 void *buf;
17111                 size_t size;
17112                 uintptr_t dest;
17113
17114                 if (copyin((void *)arg, &aggdesc, sizeof (aggdesc)) != 0)
17115                         return (EFAULT);
17116
17117                 mutex_enter(&dtrace_lock);
17118
17119                 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17120                         mutex_exit(&dtrace_lock);
17121                         return (EINVAL);
17122                 }
17123
17124                 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17125
17126                 nrecs = aggdesc.dtagd_nrecs;
17127                 aggdesc.dtagd_nrecs = 0;
17128
17129                 offs = agg->dtag_base;
17130                 lrec = &agg->dtag_action.dta_rec;
17131                 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17132
17133                 for (act = agg->dtag_first; ; act = act->dta_next) {
17134                         ASSERT(act->dta_intuple ||
17135                             DTRACEACT_ISAGG(act->dta_kind));
17136
17137                         /*
17138                          * If this action has a record size of zero, it
17139                          * denotes an argument to the aggregating action.
17140                          * Because the presence of this record doesn't (or
17141                          * shouldn't) affect the way the data is interpreted,
17142                          * we don't copy it out to save user-level the
17143                          * confusion of dealing with a zero-length record.
17144                          */
17145                         if (act->dta_rec.dtrd_size == 0) {
17146                                 ASSERT(agg->dtag_hasarg);
17147                                 continue;
17148                         }
17149
17150                         aggdesc.dtagd_nrecs++;
17151
17152                         if (act == &agg->dtag_action)
17153                                 break;
17154                 }
17155
17156                 /*
17157                  * Now that we have the size, we need to allocate a temporary
17158                  * buffer in which to store the complete description.  We need
17159                  * the temporary buffer to be able to drop dtrace_lock()
17160                  * across the copyout(), below.
17161                  */
17162                 size = sizeof (dtrace_aggdesc_t) +
17163                     (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17164
17165                 buf = kmem_alloc(size, KM_SLEEP);
17166                 dest = (uintptr_t)buf;
17167
17168                 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17169                 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17170
17171                 for (act = agg->dtag_first; ; act = act->dta_next) {
17172                         dtrace_recdesc_t rec = act->dta_rec;
17173
17174                         /*
17175                          * See the comment in the above loop for why we pass
17176                          * over zero-length records.
17177                          */
17178                         if (rec.dtrd_size == 0) {
17179                                 ASSERT(agg->dtag_hasarg);
17180                                 continue;
17181                         }
17182
17183                         if (nrecs-- == 0)
17184                                 break;
17185
17186                         rec.dtrd_offset -= offs;
17187                         bcopy(&rec, (void *)dest, sizeof (rec));
17188                         dest += sizeof (dtrace_recdesc_t);
17189
17190                         if (act == &agg->dtag_action)
17191                                 break;
17192                 }
17193
17194                 mutex_exit(&dtrace_lock);
17195
17196                 if (copyout(buf, (void *)arg, dest - (uintptr_t)buf) != 0) {
17197                         kmem_free(buf, size);
17198                         return (EFAULT);
17199                 }
17200
17201                 kmem_free(buf, size);
17202                 return (0);
17203         }
17204
17205         case DTRACEIOC_ENABLE: {
17206                 dof_hdr_t *dof;
17207                 dtrace_enabling_t *enab = NULL;
17208                 dtrace_vstate_t *vstate;
17209                 int err = 0;
17210
17211                 *rv = 0;
17212
17213                 /*
17214                  * If a NULL argument has been passed, we take this as our
17215                  * cue to reevaluate our enablings.
17216                  */
17217                 if (arg == NULL) {
17218                         dtrace_enabling_matchall();
17219
17220                         return (0);
17221                 }
17222
17223                 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
17224                         return (rval);
17225
17226                 mutex_enter(&cpu_lock);
17227                 mutex_enter(&dtrace_lock);
17228                 vstate = &state->dts_vstate;
17229
17230                 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
17231                         mutex_exit(&dtrace_lock);
17232                         mutex_exit(&cpu_lock);
17233                         dtrace_dof_destroy(dof);
17234                         return (EBUSY);
17235                 }
17236
17237                 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
17238                         mutex_exit(&dtrace_lock);
17239                         mutex_exit(&cpu_lock);
17240                         dtrace_dof_destroy(dof);
17241                         return (EINVAL);
17242                 }
17243
17244                 if ((rval = dtrace_dof_options(dof, state)) != 0) {
17245                         dtrace_enabling_destroy(enab);
17246                         mutex_exit(&dtrace_lock);
17247                         mutex_exit(&cpu_lock);
17248                         dtrace_dof_destroy(dof);
17249                         return (rval);
17250                 }
17251
17252                 if ((err = dtrace_enabling_match(enab, rv)) == 0) {
17253                         err = dtrace_enabling_retain(enab);
17254                 } else {
17255                         dtrace_enabling_destroy(enab);
17256                 }
17257
17258                 mutex_exit(&cpu_lock);
17259                 mutex_exit(&dtrace_lock);
17260                 dtrace_dof_destroy(dof);
17261
17262                 return (err);
17263         }
17264
17265         case DTRACEIOC_REPLICATE: {
17266                 dtrace_repldesc_t desc;
17267                 dtrace_probedesc_t *match = &desc.dtrpd_match;
17268                 dtrace_probedesc_t *create = &desc.dtrpd_create;
17269                 int err;
17270
17271                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17272                         return (EFAULT);
17273
17274                 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17275                 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17276                 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17277                 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17278
17279                 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17280                 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17281                 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17282                 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17283
17284                 mutex_enter(&dtrace_lock);
17285                 err = dtrace_enabling_replicate(state, match, create);
17286                 mutex_exit(&dtrace_lock);
17287
17288                 return (err);
17289         }
17290
17291         case DTRACEIOC_PROBEMATCH:
17292         case DTRACEIOC_PROBES: {
17293                 dtrace_probe_t *probe = NULL;
17294                 dtrace_probedesc_t desc;
17295                 dtrace_probekey_t pkey;
17296                 dtrace_id_t i;
17297                 int m = 0;
17298                 uint32_t priv;
17299                 uid_t uid;
17300                 zoneid_t zoneid;
17301
17302                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17303                         return (EFAULT);
17304
17305                 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
17306                 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
17307                 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
17308                 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
17309
17310                 /*
17311                  * Before we attempt to match this probe, we want to give
17312                  * all providers the opportunity to provide it.
17313                  */
17314                 if (desc.dtpd_id == DTRACE_IDNONE) {
17315                         mutex_enter(&dtrace_provider_lock);
17316                         dtrace_probe_provide(&desc, NULL);
17317                         mutex_exit(&dtrace_provider_lock);
17318                         desc.dtpd_id++;
17319                 }
17320
17321                 if (cmd == DTRACEIOC_PROBEMATCH)  {
17322                         dtrace_probekey(&desc, &pkey);
17323                         pkey.dtpk_id = DTRACE_IDNONE;
17324                 }
17325
17326                 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
17327
17328                 mutex_enter(&dtrace_lock);
17329
17330                 if (cmd == DTRACEIOC_PROBEMATCH) {
17331                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17332                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17333                                     (m = dtrace_match_probe(probe, &pkey,
17334                                     priv, uid, zoneid)) != 0)
17335                                         break;
17336                         }
17337
17338                         if (m < 0) {
17339                                 mutex_exit(&dtrace_lock);
17340                                 return (EINVAL);
17341                         }
17342
17343                 } else {
17344                         for (i = desc.dtpd_id; i <= dtrace_nprobes; i++) {
17345                                 if ((probe = dtrace_probes[i - 1]) != NULL &&
17346                                     dtrace_match_priv(probe, priv, uid, zoneid))
17347                                         break;
17348                         }
17349                 }
17350
17351                 if (probe == NULL) {
17352                         mutex_exit(&dtrace_lock);
17353                         return (ESRCH);
17354                 }
17355
17356                 dtrace_probe_description(probe, &desc);
17357                 mutex_exit(&dtrace_lock);
17358
17359                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17360                         return (EFAULT);
17361
17362                 return (0);
17363         }
17364
17365         case DTRACEIOC_PROBEARG: {
17366                 dtrace_argdesc_t desc;
17367                 dtrace_probe_t *probe;
17368                 dtrace_provider_t *prov;
17369
17370                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17371                         return (EFAULT);
17372
17373                 if (desc.dtargd_id == DTRACE_IDNONE)
17374                         return (EINVAL);
17375
17376                 if (desc.dtargd_ndx == DTRACE_ARGNONE)
17377                         return (EINVAL);
17378
17379                 mutex_enter(&dtrace_provider_lock);
17380                 mutex_enter(&mod_lock);
17381                 mutex_enter(&dtrace_lock);
17382
17383                 if (desc.dtargd_id > dtrace_nprobes) {
17384                         mutex_exit(&dtrace_lock);
17385                         mutex_exit(&mod_lock);
17386                         mutex_exit(&dtrace_provider_lock);
17387                         return (EINVAL);
17388                 }
17389
17390                 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
17391                         mutex_exit(&dtrace_lock);
17392                         mutex_exit(&mod_lock);
17393                         mutex_exit(&dtrace_provider_lock);
17394                         return (EINVAL);
17395                 }
17396
17397                 mutex_exit(&dtrace_lock);
17398
17399                 prov = probe->dtpr_provider;
17400
17401                 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
17402                         /*
17403                          * There isn't any typed information for this probe.
17404                          * Set the argument number to DTRACE_ARGNONE.
17405                          */
17406                         desc.dtargd_ndx = DTRACE_ARGNONE;
17407                 } else {
17408                         desc.dtargd_native[0] = '\0';
17409                         desc.dtargd_xlate[0] = '\0';
17410                         desc.dtargd_mapping = desc.dtargd_ndx;
17411
17412                         prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
17413                             probe->dtpr_id, probe->dtpr_arg, &desc);
17414                 }
17415
17416                 mutex_exit(&mod_lock);
17417                 mutex_exit(&dtrace_provider_lock);
17418
17419                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17420                         return (EFAULT);
17421
17422                 return (0);
17423         }
17424
17425         case DTRACEIOC_GO: {
17426                 processorid_t cpuid;
17427                 rval = dtrace_state_go(state, &cpuid);
17428
17429                 if (rval != 0)
17430                         return (rval);
17431
17432                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17433                         return (EFAULT);
17434
17435                 return (0);
17436         }
17437
17438         case DTRACEIOC_STOP: {
17439                 processorid_t cpuid;
17440
17441                 mutex_enter(&dtrace_lock);
17442                 rval = dtrace_state_stop(state, &cpuid);
17443                 mutex_exit(&dtrace_lock);
17444
17445                 if (rval != 0)
17446                         return (rval);
17447
17448                 if (copyout(&cpuid, (void *)arg, sizeof (cpuid)) != 0)
17449                         return (EFAULT);
17450
17451                 return (0);
17452         }
17453
17454         case DTRACEIOC_DOFGET: {
17455                 dof_hdr_t hdr, *dof;
17456                 uint64_t len;
17457
17458                 if (copyin((void *)arg, &hdr, sizeof (hdr)) != 0)
17459                         return (EFAULT);
17460
17461                 mutex_enter(&dtrace_lock);
17462                 dof = dtrace_dof_create(state);
17463                 mutex_exit(&dtrace_lock);
17464
17465                 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
17466                 rval = copyout(dof, (void *)arg, len);
17467                 dtrace_dof_destroy(dof);
17468
17469                 return (rval == 0 ? 0 : EFAULT);
17470         }
17471
17472         case DTRACEIOC_AGGSNAP:
17473         case DTRACEIOC_BUFSNAP: {
17474                 dtrace_bufdesc_t desc;
17475                 caddr_t cached;
17476                 dtrace_buffer_t *buf;
17477
17478                 if (copyin((void *)arg, &desc, sizeof (desc)) != 0)
17479                         return (EFAULT);
17480
17481                 if (desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
17482                         return (EINVAL);
17483
17484                 mutex_enter(&dtrace_lock);
17485
17486                 if (cmd == DTRACEIOC_BUFSNAP) {
17487                         buf = &state->dts_buffer[desc.dtbd_cpu];
17488                 } else {
17489                         buf = &state->dts_aggbuffer[desc.dtbd_cpu];
17490                 }
17491
17492                 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
17493                         size_t sz = buf->dtb_offset;
17494
17495                         if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
17496                                 mutex_exit(&dtrace_lock);
17497                                 return (EBUSY);
17498                         }
17499
17500                         /*
17501                          * If this buffer has already been consumed, we're
17502                          * going to indicate that there's nothing left here
17503                          * to consume.
17504                          */
17505                         if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
17506                                 mutex_exit(&dtrace_lock);
17507
17508                                 desc.dtbd_size = 0;
17509                                 desc.dtbd_drops = 0;
17510                                 desc.dtbd_errors = 0;
17511                                 desc.dtbd_oldest = 0;
17512                                 sz = sizeof (desc);
17513
17514                                 if (copyout(&desc, (void *)arg, sz) != 0)
17515                                         return (EFAULT);
17516
17517                                 return (0);
17518                         }
17519
17520                         /*
17521                          * If this is a ring buffer that has wrapped, we want
17522                          * to copy the whole thing out.
17523                          */
17524                         if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
17525                                 dtrace_buffer_polish(buf);
17526                                 sz = buf->dtb_size;
17527                         }
17528
17529                         if (copyout(buf->dtb_tomax, desc.dtbd_data, sz) != 0) {
17530                                 mutex_exit(&dtrace_lock);
17531                                 return (EFAULT);
17532                         }
17533
17534                         desc.dtbd_size = sz;
17535                         desc.dtbd_drops = buf->dtb_drops;
17536                         desc.dtbd_errors = buf->dtb_errors;
17537                         desc.dtbd_oldest = buf->dtb_xamot_offset;
17538                         desc.dtbd_timestamp = dtrace_gethrtime();
17539
17540                         mutex_exit(&dtrace_lock);
17541
17542                         if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17543                                 return (EFAULT);
17544
17545                         buf->dtb_flags |= DTRACEBUF_CONSUMED;
17546
17547                         return (0);
17548                 }
17549
17550                 if (buf->dtb_tomax == NULL) {
17551                         ASSERT(buf->dtb_xamot == NULL);
17552                         mutex_exit(&dtrace_lock);
17553                         return (ENOENT);
17554                 }
17555
17556                 cached = buf->dtb_tomax;
17557                 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
17558
17559                 dtrace_xcall(desc.dtbd_cpu,
17560                     (dtrace_xcall_t)dtrace_buffer_switch, buf);
17561
17562                 state->dts_errors += buf->dtb_xamot_errors;
17563
17564                 /*
17565                  * If the buffers did not actually switch, then the cross call
17566                  * did not take place -- presumably because the given CPU is
17567                  * not in the ready set.  If this is the case, we'll return
17568                  * ENOENT.
17569                  */
17570                 if (buf->dtb_tomax == cached) {
17571                         ASSERT(buf->dtb_xamot != cached);
17572                         mutex_exit(&dtrace_lock);
17573                         return (ENOENT);
17574                 }
17575
17576                 ASSERT(cached == buf->dtb_xamot);
17577
17578                 /*
17579                  * We have our snapshot; now copy it out.
17580                  */
17581                 if (copyout(buf->dtb_xamot, desc.dtbd_data,
17582                     buf->dtb_xamot_offset) != 0) {
17583                         mutex_exit(&dtrace_lock);
17584                         return (EFAULT);
17585                 }
17586
17587                 desc.dtbd_size = buf->dtb_xamot_offset;
17588                 desc.dtbd_drops = buf->dtb_xamot_drops;
17589                 desc.dtbd_errors = buf->dtb_xamot_errors;
17590                 desc.dtbd_oldest = 0;
17591                 desc.dtbd_timestamp = buf->dtb_switched;
17592
17593                 mutex_exit(&dtrace_lock);
17594
17595                 /*
17596                  * Finally, copy out the buffer description.
17597                  */
17598                 if (copyout(&desc, (void *)arg, sizeof (desc)) != 0)
17599                         return (EFAULT);
17600
17601                 return (0);
17602         }
17603
17604         case DTRACEIOC_CONF: {
17605                 dtrace_conf_t conf;
17606
17607                 bzero(&conf, sizeof (conf));
17608                 conf.dtc_difversion = DIF_VERSION;
17609                 conf.dtc_difintregs = DIF_DIR_NREGS;
17610                 conf.dtc_diftupregs = DIF_DTR_NREGS;
17611                 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
17612
17613                 if (copyout(&conf, (void *)arg, sizeof (conf)) != 0)
17614                         return (EFAULT);
17615
17616                 return (0);
17617         }
17618
17619         case DTRACEIOC_STATUS: {
17620                 dtrace_status_t stat;
17621                 dtrace_dstate_t *dstate;
17622                 int i, j;
17623                 uint64_t nerrs;
17624
17625                 /*
17626                  * See the comment in dtrace_state_deadman() for the reason
17627                  * for setting dts_laststatus to INT64_MAX before setting
17628                  * it to the correct value.
17629                  */
17630                 state->dts_laststatus = INT64_MAX;
17631                 dtrace_membar_producer();
17632                 state->dts_laststatus = dtrace_gethrtime();
17633
17634                 bzero(&stat, sizeof (stat));
17635
17636                 mutex_enter(&dtrace_lock);
17637
17638                 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
17639                         mutex_exit(&dtrace_lock);
17640                         return (ENOENT);
17641                 }
17642
17643                 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
17644                         stat.dtst_exiting = 1;
17645
17646                 nerrs = state->dts_errors;
17647                 dstate = &state->dts_vstate.dtvs_dynvars;
17648
17649                 for (i = 0; i < NCPU; i++) {
17650                         dtrace_dstate_percpu_t *dcpu = &dstate->dtds_percpu[i];
17651
17652                         stat.dtst_dyndrops += dcpu->dtdsc_drops;
17653                         stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
17654                         stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
17655
17656                         if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
17657                                 stat.dtst_filled++;
17658
17659                         nerrs += state->dts_buffer[i].dtb_errors;
17660
17661                         for (j = 0; j < state->dts_nspeculations; j++) {
17662                                 dtrace_speculation_t *spec;
17663                                 dtrace_buffer_t *buf;
17664
17665                                 spec = &state->dts_speculations[j];
17666                                 buf = &spec->dtsp_buffer[i];
17667                                 stat.dtst_specdrops += buf->dtb_xamot_drops;
17668                         }
17669                 }
17670
17671                 stat.dtst_specdrops_busy = state->dts_speculations_busy;
17672                 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
17673                 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
17674                 stat.dtst_dblerrors = state->dts_dblerrors;
17675                 stat.dtst_killed =
17676                     (state->dts_activity == DTRACE_ACTIVITY_KILLED);
17677                 stat.dtst_errors = nerrs;
17678
17679                 mutex_exit(&dtrace_lock);
17680
17681                 if (copyout(&stat, (void *)arg, sizeof (stat)) != 0)
17682                         return (EFAULT);
17683
17684                 return (0);
17685         }
17686
17687         case DTRACEIOC_FORMAT: {
17688                 dtrace_fmtdesc_t fmt;
17689                 char *str;
17690                 int len;
17691
17692                 if (copyin((void *)arg, &fmt, sizeof (fmt)) != 0)
17693                         return (EFAULT);
17694
17695                 mutex_enter(&dtrace_lock);
17696
17697                 if (fmt.dtfd_format == 0 ||
17698                     fmt.dtfd_format > state->dts_nformats) {
17699                         mutex_exit(&dtrace_lock);
17700                         return (EINVAL);
17701                 }
17702
17703                 /*
17704                  * Format strings are allocated contiguously and they are
17705                  * never freed; if a format index is less than the number
17706                  * of formats, we can assert that the format map is non-NULL
17707                  * and that the format for the specified index is non-NULL.
17708                  */
17709                 ASSERT(state->dts_formats != NULL);
17710                 str = state->dts_formats[fmt.dtfd_format - 1];
17711                 ASSERT(str != NULL);
17712
17713                 len = strlen(str) + 1;
17714
17715                 if (len > fmt.dtfd_length) {
17716                         fmt.dtfd_length = len;
17717
17718                         if (copyout(&fmt, (void *)arg, sizeof (fmt)) != 0) {
17719                                 mutex_exit(&dtrace_lock);
17720                                 return (EINVAL);
17721                         }
17722                 } else {
17723                         if (copyout(str, fmt.dtfd_string, len) != 0) {
17724                                 mutex_exit(&dtrace_lock);
17725                                 return (EINVAL);
17726                         }
17727                 }
17728
17729                 mutex_exit(&dtrace_lock);
17730                 return (0);
17731         }
17732
17733         default:
17734                 break;
17735         }
17736
17737         return (ENOTTY);
17738 }
17739
17740 /*ARGSUSED*/
17741 static int
17742 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
17743 {
17744         dtrace_state_t *state;
17745
17746         switch (cmd) {
17747         case DDI_DETACH:
17748                 break;
17749
17750         case DDI_SUSPEND:
17751                 return (DDI_SUCCESS);
17752
17753         default:
17754                 return (DDI_FAILURE);
17755         }
17756
17757         mutex_enter(&cpu_lock);
17758         mutex_enter(&dtrace_provider_lock);
17759         mutex_enter(&dtrace_lock);
17760
17761         ASSERT(dtrace_opens == 0);
17762
17763         if (dtrace_helpers > 0) {
17764                 mutex_exit(&dtrace_provider_lock);
17765                 mutex_exit(&dtrace_lock);
17766                 mutex_exit(&cpu_lock);
17767                 return (DDI_FAILURE);
17768         }
17769
17770         if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
17771                 mutex_exit(&dtrace_provider_lock);
17772                 mutex_exit(&dtrace_lock);
17773                 mutex_exit(&cpu_lock);
17774                 return (DDI_FAILURE);
17775         }
17776
17777         dtrace_provider = NULL;
17778
17779         if ((state = dtrace_anon_grab()) != NULL) {
17780                 /*
17781                  * If there were ECBs on this state, the provider should
17782                  * have not been allowed to detach; assert that there is
17783                  * none.
17784                  */
17785                 ASSERT(state->dts_necbs == 0);
17786                 dtrace_state_destroy(state);
17787
17788                 /*
17789                  * If we're being detached with anonymous state, we need to
17790                  * indicate to the kernel debugger that DTrace is now inactive.
17791                  */
17792                 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17793         }
17794
17795         bzero(&dtrace_anon, sizeof (dtrace_anon_t));
17796         unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17797         dtrace_cpu_init = NULL;
17798         dtrace_helpers_cleanup = NULL;
17799         dtrace_helpers_fork = NULL;
17800         dtrace_cpustart_init = NULL;
17801         dtrace_cpustart_fini = NULL;
17802         dtrace_debugger_init = NULL;
17803         dtrace_debugger_fini = NULL;
17804         dtrace_modload = NULL;
17805         dtrace_modunload = NULL;
17806
17807         ASSERT(dtrace_getf == 0);
17808         ASSERT(dtrace_closef == NULL);
17809
17810         mutex_exit(&cpu_lock);
17811
17812         kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
17813         dtrace_probes = NULL;
17814         dtrace_nprobes = 0;
17815
17816         dtrace_hash_destroy(dtrace_bymod);
17817         dtrace_hash_destroy(dtrace_byfunc);
17818         dtrace_hash_destroy(dtrace_byname);
17819         dtrace_bymod = NULL;
17820         dtrace_byfunc = NULL;
17821         dtrace_byname = NULL;
17822
17823         kmem_cache_destroy(dtrace_state_cache);
17824         vmem_destroy(dtrace_minor);
17825         vmem_destroy(dtrace_arena);
17826
17827         if (dtrace_toxrange != NULL) {
17828                 kmem_free(dtrace_toxrange,
17829                     dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
17830                 dtrace_toxrange = NULL;
17831                 dtrace_toxranges = 0;
17832                 dtrace_toxranges_max = 0;
17833         }
17834
17835         ddi_remove_minor_node(dtrace_devi, NULL);
17836         dtrace_devi = NULL;
17837
17838         ddi_soft_state_fini(&dtrace_softstate);
17839
17840         ASSERT(dtrace_vtime_references == 0);
17841         ASSERT(dtrace_opens == 0);
17842         ASSERT(dtrace_retained == NULL);
17843
17844         mutex_exit(&dtrace_lock);
17845         mutex_exit(&dtrace_provider_lock);
17846
17847         /*
17848          * We don't destroy the task queue until after we have dropped our
17849          * locks (taskq_destroy() may block on running tasks).  To prevent
17850          * attempting to do work after we have effectively detached but before
17851          * the task queue has been destroyed, all tasks dispatched via the
17852          * task queue must check that DTrace is still attached before
17853          * performing any operation.
17854          */
17855         taskq_destroy(dtrace_taskq);
17856         dtrace_taskq = NULL;
17857
17858         return (DDI_SUCCESS);
17859 }
17860 #endif
17861
17862 #if defined(sun)
17863 /*ARGSUSED*/
17864 static int
17865 dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
17866 {
17867         int error;
17868
17869         switch (infocmd) {
17870         case DDI_INFO_DEVT2DEVINFO:
17871                 *result = (void *)dtrace_devi;
17872                 error = DDI_SUCCESS;
17873                 break;
17874         case DDI_INFO_DEVT2INSTANCE:
17875                 *result = (void *)0;
17876                 error = DDI_SUCCESS;
17877                 break;
17878         default:
17879                 error = DDI_FAILURE;
17880         }
17881         return (error);
17882 }
17883 #endif
17884
17885 #if defined(sun)
17886 static struct cb_ops dtrace_cb_ops = {
17887         dtrace_open,            /* open */
17888         dtrace_close,           /* close */
17889         nulldev,                /* strategy */
17890         nulldev,                /* print */
17891         nodev,                  /* dump */
17892         nodev,                  /* read */
17893         nodev,                  /* write */
17894         dtrace_ioctl,           /* ioctl */
17895         nodev,                  /* devmap */
17896         nodev,                  /* mmap */
17897         nodev,                  /* segmap */
17898         nochpoll,               /* poll */
17899         ddi_prop_op,            /* cb_prop_op */
17900         0,                      /* streamtab  */
17901         D_NEW | D_MP            /* Driver compatibility flag */
17902 };
17903
17904 static struct dev_ops dtrace_ops = {
17905         DEVO_REV,               /* devo_rev */
17906         0,                      /* refcnt */
17907         dtrace_info,            /* get_dev_info */
17908         nulldev,                /* identify */
17909         nulldev,                /* probe */
17910         dtrace_attach,          /* attach */
17911         dtrace_detach,          /* detach */
17912         nodev,                  /* reset */
17913         &dtrace_cb_ops,         /* driver operations */
17914         NULL,                   /* bus operations */
17915         nodev                   /* dev power */
17916 };
17917
17918 static struct modldrv modldrv = {
17919         &mod_driverops,         /* module type (this is a pseudo driver) */
17920         "Dynamic Tracing",      /* name of module */
17921         &dtrace_ops,            /* driver ops */
17922 };
17923
17924 static struct modlinkage modlinkage = {
17925         MODREV_1,
17926         (void *)&modldrv,
17927         NULL
17928 };
17929
17930 int
17931 _init(void)
17932 {
17933         return (mod_install(&modlinkage));
17934 }
17935
17936 int
17937 _info(struct modinfo *modinfop)
17938 {
17939         return (mod_info(&modlinkage, modinfop));
17940 }
17941
17942 int
17943 _fini(void)
17944 {
17945         return (mod_remove(&modlinkage));
17946 }
17947 #else
17948
17949 static d_ioctl_t        dtrace_ioctl;
17950 static d_ioctl_t        dtrace_ioctl_helper;
17951 static void             dtrace_load(void *);
17952 static int              dtrace_unload(void);
17953 static struct cdev      *dtrace_dev;
17954 static struct cdev      *helper_dev;
17955
17956 void dtrace_invop_init(void);
17957 void dtrace_invop_uninit(void);
17958
17959 static struct cdevsw dtrace_cdevsw = {
17960         .d_version      = D_VERSION,
17961         .d_ioctl        = dtrace_ioctl,
17962         .d_open         = dtrace_open,
17963         .d_name         = "dtrace",
17964 };
17965
17966 static struct cdevsw helper_cdevsw = {
17967         .d_version      = D_VERSION,
17968         .d_ioctl        = dtrace_ioctl_helper,
17969         .d_name         = "helper",
17970 };
17971
17972 #include <dtrace_anon.c>
17973 #include <dtrace_ioctl.c>
17974 #include <dtrace_load.c>
17975 #include <dtrace_modevent.c>
17976 #include <dtrace_sysctl.c>
17977 #include <dtrace_unload.c>
17978 #include <dtrace_vtime.c>
17979 #include <dtrace_hacks.c>
17980 #include <dtrace_isa.c>
17981
17982 SYSINIT(dtrace_load, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_load, NULL);
17983 SYSUNINIT(dtrace_unload, SI_SUB_DTRACE, SI_ORDER_FIRST, dtrace_unload, NULL);
17984 SYSINIT(dtrace_anon_init, SI_SUB_DTRACE_ANON, SI_ORDER_FIRST, dtrace_anon_init, NULL);
17985
17986 DEV_MODULE(dtrace, dtrace_modevent, NULL);
17987 MODULE_VERSION(dtrace, 1);
17988 MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
17989 #endif