]> CyberLeo.Net >> Repos - FreeBSD/releng/10.2.git/blob - cddl/contrib/opensolaris/cmd/lockstat/lockstat.c
- Copy stable/10@285827 to releng/10.2 in preparation for 10.2-RC1
[FreeBSD/releng/10.2.git] / cddl / contrib / opensolaris / cmd / lockstat / lockstat.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 #pragma ident   "%Z%%M% %I%     %E% SMI"
27
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <stdarg.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <ctype.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <sys/types.h>
40 #include <sys/modctl.h>
41 #include <sys/stat.h>
42 #include <sys/wait.h>
43 #include <dtrace.h>
44 #include <sys/lockstat.h>
45 #include <alloca.h>
46 #include <signal.h>
47 #include <assert.h>
48
49 #if defined(sun)
50 #define GETOPT_EOF      EOF
51 #else
52 /* FreeBSD */ 
53 #include <sys/time.h>
54 #include <sys/resource.h>
55
56 #define mergesort(a, b, c, d)   lsmergesort(a, b, c, d)
57 #define GETOPT_EOF              (-1)
58
59 typedef uintptr_t       pc_t;
60 #endif /* defined(sun) */
61
62 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
63
64 #define LS_MAX_STACK_DEPTH      50
65 #define LS_MAX_EVENTS           64
66
67 typedef struct lsrec {
68         struct lsrec    *ls_next;       /* next in hash chain */
69         uintptr_t       ls_lock;        /* lock address */
70         uintptr_t       ls_caller;      /* caller address */
71         uint32_t        ls_count;       /* cumulative event count */
72         uint32_t        ls_event;       /* type of event */
73         uintptr_t       ls_refcnt;      /* cumulative reference count */
74         uint64_t        ls_time;        /* cumulative event duration */
75         uint32_t        ls_hist[64];    /* log2(duration) histogram */
76         uintptr_t       ls_stack[LS_MAX_STACK_DEPTH];
77 } lsrec_t;
78
/*
 * Bookkeeping for a buffer of lsrec records: a cursor to the next free
 * record and the number of records accounted for so far.
 */
typedef struct lsdata lsdata_t;

struct lsdata {
        struct lsrec    *lsd_next;      /* next available */
        int             lsd_count;      /* number of records */
};
83
84 /*
85  * Definitions for the types of experiments which can be run.  They are
86  * listed in increasing order of memory cost and processing time cost.
87  * The numerical value of each type is the number of bytes needed per record.
88  */
89 #define LS_BASIC        offsetof(lsrec_t, ls_time)
90 #define LS_TIME         offsetof(lsrec_t, ls_hist[0])
91 #define LS_HIST         offsetof(lsrec_t, ls_stack[0])
92 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])
93
94 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
95 static void report_trace(FILE *, lsrec_t **);
96
97 extern int symtab_init(void);
98 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
99 extern uintptr_t sym_to_addr(char *name);
100 extern size_t sym_size(char *name);
101 extern char *strtok_r(char *, const char *, char **);
102
103 #define DEFAULT_NRECS   10000
104 #define DEFAULT_HZ      97
105 #define MAX_HZ          1000
106 #define MIN_AGGSIZE     (16 * 1024)
107 #define MAX_AGGSIZE     (32 * 1024 * 1024)
108
109 static int g_stkdepth;
110 static int g_topn = INT_MAX;
111 static hrtime_t g_elapsed;
112 static int g_rates = 0;
113 static int g_pflag = 0;
114 static int g_Pflag = 0;
115 static int g_wflag = 0;
116 static int g_Wflag = 0;
117 static int g_cflag = 0;
118 static int g_kflag = 0;
119 static int g_gflag = 0;
120 static int g_Vflag = 0;
121 static int g_tracing = 0;
122 static size_t g_recsize;
123 static size_t g_nrecs;
124 static int g_nrecs_used;
125 static uchar_t g_enabled[LS_MAX_EVENTS];
126 static hrtime_t g_min_duration[LS_MAX_EVENTS];
127 static dtrace_hdl_t *g_dtp;
128 static char *g_predicate;
129 static char *g_ipredicate;
130 static char *g_prog;
131 static int g_proglen;
132 static int g_dropped;
133
134 typedef struct ls_event_info {
135         char    ev_type;
136         char    ev_lhdr[20];
137         char    ev_desc[80];
138         char    ev_units[10];
139         char    ev_name[DTRACE_NAMELEN];
140         char    *ev_predicate;
141         char    *ev_acquire;
142 } ls_event_info_t;
143
144 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
145         { 'C',  "Lock", "Adaptive mutex spin",                  "nsec",
146             "lockstat:::adaptive-spin" },
147         { 'C',  "Lock", "Adaptive mutex block",                 "nsec",
148             "lockstat:::adaptive-block" },
149         { 'C',  "Lock", "Spin lock spin",                       "nsec",
150             "lockstat:::spin-spin" },
151         { 'C',  "Lock", "Thread lock spin",                     "nsec",
152             "lockstat:::thread-spin" },
153         { 'C',  "Lock", "R/W writer blocked by writer",         "nsec",
154             "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
155         { 'C',  "Lock", "R/W writer blocked by readers",        "nsec",
156             "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
157         { 'C',  "Lock", "R/W reader blocked by writer",         "nsec",
158             "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
159         { 'C',  "Lock", "R/W reader blocked by write wanted",   "nsec",
160             "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
161         { 'C',  "Lock", "R/W writer spin on writer",            "nsec",
162             "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },
163         { 'C',  "Lock", "R/W writer spin on readers",           "nsec",
164             "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },
165         { 'C',  "Lock", "R/W reader spin on writer",            "nsec",
166             "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" },
167         { 'C',  "Lock", "R/W reader spin on write wanted",      "nsec",
168             "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" },
169         { 'C',  "Lock", "SX exclusive block",                   "nsec",
170             "lockstat:::sx-block", "arg2 == 0" },
171         { 'C',  "Lock", "SX shared block",                      "nsec",
172             "lockstat:::sx-block", "arg2 != 0" },
173         { 'C',  "Lock", "SX exclusive spin",                    "nsec",
174             "lockstat:::sx-spin", "arg2 == 0" },
175         { 'C',  "Lock", "SX shared spin",                       "nsec",
176             "lockstat:::sx-spin", "arg2 != 0" },
177         { 'C',  "Lock", "Unknown event (type 16)",              "units" },
178         { 'C',  "Lock", "Unknown event (type 17)",              "units" },
179         { 'C',  "Lock", "Unknown event (type 18)",              "units" },
180         { 'C',  "Lock", "Unknown event (type 19)",              "units" },
181         { 'C',  "Lock", "Unknown event (type 20)",              "units" },
182         { 'C',  "Lock", "Unknown event (type 21)",              "units" },
183         { 'C',  "Lock", "Unknown event (type 22)",              "units" },
184         { 'C',  "Lock", "Unknown event (type 23)",              "units" },
185         { 'C',  "Lock", "Unknown event (type 24)",              "units" },
186         { 'C',  "Lock", "Unknown event (type 25)",              "units" },
187         { 'C',  "Lock", "Unknown event (type 26)",              "units" },
188         { 'C',  "Lock", "Unknown event (type 27)",              "units" },
189         { 'C',  "Lock", "Unknown event (type 28)",              "units" },
190         { 'C',  "Lock", "Unknown event (type 29)",              "units" },
191         { 'C',  "Lock", "Unknown event (type 30)",              "units" },
192         { 'C',  "Lock", "Unknown event (type 31)",              "units" },
193         { 'H',  "Lock", "Adaptive mutex hold",                  "nsec",
194             "lockstat:::adaptive-release", NULL,
195             "lockstat:::adaptive-acquire" },
196         { 'H',  "Lock", "Spin lock hold",                       "nsec",
197             "lockstat:::spin-release", NULL,
198             "lockstat:::spin-acquire" },
199         { 'H',  "Lock", "R/W writer hold",                      "nsec",
200             "lockstat::rw_wunlock:rw-release", NULL,
201             "lockstat::rw_wlock:rw-acquire" },
202         { 'H',  "Lock", "R/W reader hold",                      "nsec",
203             "lockstat::rw_runlock:rw-release", NULL,
204             "lockstat::rw_rlock:rw-acquire" },
205         { 'H',  "Lock", "SX shared hold",                       "nsec",
206             "lockstat::sx_sunlock:sx-release", NULL,
207             "lockstat::sx_slock:sx-acquire" },
208         { 'H',  "Lock", "SX exclusive hold",                    "nsec",
209             "lockstat::sx_xunlock:sx-release", NULL,
210             "lockstat::sx_xlock:sx-acquire" },
211         { 'H',  "Lock", "Unknown event (type 38)",              "units" },
212         { 'H',  "Lock", "Unknown event (type 39)",              "units" },
213         { 'H',  "Lock", "Unknown event (type 40)",              "units" },
214         { 'H',  "Lock", "Unknown event (type 41)",              "units" },
215         { 'H',  "Lock", "Unknown event (type 42)",              "units" },
216         { 'H',  "Lock", "Unknown event (type 43)",              "units" },
217         { 'H',  "Lock", "Unknown event (type 44)",              "units" },
218         { 'H',  "Lock", "Unknown event (type 45)",              "units" },
219         { 'H',  "Lock", "Unknown event (type 46)",              "units" },
220         { 'H',  "Lock", "Unknown event (type 47)",              "units" },
221         { 'H',  "Lock", "Unknown event (type 48)",              "units" },
222         { 'H',  "Lock", "Unknown event (type 49)",              "units" },
223         { 'H',  "Lock", "Unknown event (type 50)",              "units" },
224         { 'H',  "Lock", "Unknown event (type 51)",              "units" },
225         { 'H',  "Lock", "Unknown event (type 52)",              "units" },
226         { 'H',  "Lock", "Unknown event (type 53)",              "units" },
227         { 'H',  "Lock", "Unknown event (type 54)",              "units" },
228         { 'H',  "Lock", "Unknown event (type 55)",              "units" },
229 #if defined(sun)
230         { 'I',  "CPU+PIL", "Profiling interrupt",               "nsec",
231 #else
232         /* FreeBSD */
233         { 'I',  "CPU+Pri_Class", "Profiling interrupt",         "nsec",
234 #endif
235             "profile:::profile-97", NULL },
236         { 'I',  "Lock", "Unknown event (type 57)",              "units" },
237         { 'I',  "Lock", "Unknown event (type 58)",              "units" },
238         { 'I',  "Lock", "Unknown event (type 59)",              "units" },
239         { 'E',  "Lock", "Recursive lock entry detected",        "(N/A)",
240             "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
241         { 'E',  "Lock", "Lockstat enter failure",               "(N/A)" },
242         { 'E',  "Lock", "Lockstat exit failure",                "nsec"  },
243         { 'E',  "Lock", "Lockstat record failure",              "(N/A)" },
244 };
245
246 #if !defined(sun)
247 static char *g_pri_class[] = {
248         "",
249         "Intr",
250         "RealT",
251         "TShar",
252         "Idle"
253 };
254 #endif
255
/*
 * Report a fatal error and terminate.
 *
 * Writes "lockstat: " followed by the printf-style formatted message to
 * stderr.  If do_perror is non-zero, ": " and the strerror() text for the
 * errno value that was current on entry is appended.  A newline follows and
 * the process exits with status 2; this function does not return.
 */
static void
fail(int do_perror, const char *message, ...)
{
        /* Latch errno before any stdio call has a chance to change it. */
        int saved_errno = errno;
        va_list ap;

        (void) fputs("lockstat: ", stderr);

        va_start(ap, message);
        (void) vfprintf(stderr, message, ap);
        va_end(ap);

        if (do_perror != 0)
                (void) fprintf(stderr, ": %s", strerror(saved_errno));

        (void) fputc('\n', stderr);
        exit(2);
}
271
272 static void
273 dfail(const char *message, ...)
274 {
275         va_list args;
276
277         va_start(args, message);
278         (void) fprintf(stderr, "lockstat: ");
279         (void) vfprintf(stderr, message, args);
280         va_end(args);
281         (void) fprintf(stderr, ": %s\n",
282             dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
283
284         exit(2);
285 }
286
287 static void
288 show_events(char event_type, char *desc)
289 {
290         int i, first = -1, last;
291
292         for (i = 0; i < LS_MAX_EVENTS; i++) {
293                 ls_event_info_t *evp = &g_event_info[i];
294                 if (evp->ev_type != event_type ||
295                     strncmp(evp->ev_desc, "Unknown event", 13) == 0)
296                         continue;
297                 if (first == -1)
298                         first = i;
299                 last = i;
300         }
301
302         (void) fprintf(stderr,
303             "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
304             desc, event_type, first, last);
305
306         for (i = first; i <= last; i++)
307                 (void) fprintf(stderr,
308                     "%4d = %s\n", i, g_event_info[i].ev_desc);
309 }
310
311 static void
312 usage(void)
313 {
314         (void) fprintf(stderr,
315             "Usage: lockstat [options] command [args]\n"
316             "\nEvent selection options:\n\n"
317             "  -C              watch contention events [on by default]\n"
318             "  -E              watch error events [off by default]\n"
319             "  -H              watch hold events [off by default]\n"
320             "  -I              watch interrupt events [off by default]\n"
321             "  -A              watch all lock events [equivalent to -CH]\n"
322             "  -e event_list   only watch the specified events (shown below);\n"
323             "                  <event_list> is a comma-separated list of\n"
324             "                  events or ranges of events, e.g. 1,4-7,35\n"
325             "  -i rate         interrupt rate for -I [default: %d Hz]\n"
326             "\nData gathering options:\n\n"
327             "  -b              basic statistics (lock, caller, event count)\n"
328             "  -t              timing for all events [default]\n"
329             "  -h              histograms for event times\n"
330             "  -s depth        stack traces <depth> deep\n"
331             "  -x opt[=val]    enable or modify DTrace options\n"
332             "\nData filtering options:\n\n"
333             "  -n nrecords     maximum number of data records [default: %d]\n"
334             "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
335             "                  symbolic name or hex address; <size> defaults\n"
336             "                  to the ELF symbol size if available, 1 if not\n"
337             "  -f func[,size]  only watch events generated by <func>\n"
338             "  -d duration     only watch events longer than <duration>\n"
339             "  -T              trace (rather than sample) events\n"
340             "\nData reporting options:\n\n"
341             "  -c              coalesce lock data for arrays like pse_mutex[]\n"
342             "  -k              coalesce PCs within functions\n"
343             "  -g              show total events generated by function\n"
344             "  -w              wherever: don't distinguish events by caller\n"
345             "  -W              whichever: don't distinguish events by lock\n"
346             "  -R              display rates rather than counts\n"
347             "  -p              parsable output format (awk(1)-friendly)\n"
348             "  -P              sort lock data by (count * avg_time) product\n"
349             "  -D n            only display top <n> events of each type\n"
350             "  -o filename     send output to <filename>\n",
351             DEFAULT_HZ, DEFAULT_NRECS);
352
353         show_events('C', "Contention");
354         show_events('H', "Hold-time");
355         show_events('I', "Interrupt");
356         show_events('E', "Error");
357         (void) fprintf(stderr, "\n");
358
359         exit(1);
360 }
361
362 static int
363 lockcmp(lsrec_t *a, lsrec_t *b)
364 {
365         int i;
366
367         if (a->ls_event < b->ls_event)
368                 return (-1);
369         if (a->ls_event > b->ls_event)
370                 return (1);
371
372         for (i = g_stkdepth - 1; i >= 0; i--) {
373                 if (a->ls_stack[i] < b->ls_stack[i])
374                         return (-1);
375                 if (a->ls_stack[i] > b->ls_stack[i])
376                         return (1);
377         }
378
379         if (a->ls_caller < b->ls_caller)
380                 return (-1);
381         if (a->ls_caller > b->ls_caller)
382                 return (1);
383
384         if (a->ls_lock < b->ls_lock)
385                 return (-1);
386         if (a->ls_lock > b->ls_lock)
387                 return (1);
388
389         return (0);
390 }
391
392 static int
393 countcmp(lsrec_t *a, lsrec_t *b)
394 {
395         if (a->ls_event < b->ls_event)
396                 return (-1);
397         if (a->ls_event > b->ls_event)
398                 return (1);
399
400         return (b->ls_count - a->ls_count);
401 }
402
403 static int
404 timecmp(lsrec_t *a, lsrec_t *b)
405 {
406         if (a->ls_event < b->ls_event)
407                 return (-1);
408         if (a->ls_event > b->ls_event)
409                 return (1);
410
411         if (a->ls_time < b->ls_time)
412                 return (1);
413         if (a->ls_time > b->ls_time)
414                 return (-1);
415
416         return (0);
417 }
418
419 static int
420 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
421 {
422         if (a->ls_event < b->ls_event)
423                 return (-1);
424         if (a->ls_event > b->ls_event)
425                 return (1);
426
427         if (a->ls_lock < b->ls_lock)
428                 return (-1);
429         if (a->ls_lock > b->ls_lock)
430                 return (1);
431
432         return (0);
433 }
434
435 static int
436 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
437 {
438         if (a->ls_event < b->ls_event)
439                 return (-1);
440         if (a->ls_event > b->ls_event)
441                 return (1);
442
443         if (a->ls_lock < b->ls_lock)
444                 return (-1);
445         if (a->ls_lock > b->ls_lock)
446                 return (1);
447
448         return (b->ls_count - a->ls_count);
449 }
450
451 static int
452 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
453 {
454         int i;
455
456         if (a->ls_event < b->ls_event)
457                 return (-1);
458         if (a->ls_event > b->ls_event)
459                 return (1);
460
461         for (i = g_stkdepth - 1; i >= 0; i--) {
462                 if (a->ls_stack[i] < b->ls_stack[i])
463                         return (-1);
464                 if (a->ls_stack[i] > b->ls_stack[i])
465                         return (1);
466         }
467
468         if (a->ls_caller < b->ls_caller)
469                 return (-1);
470         if (a->ls_caller > b->ls_caller)
471                 return (1);
472
473         return (0);
474 }
475
476 static int
477 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
478 {
479         int i;
480
481         if (a->ls_event < b->ls_event)
482                 return (-1);
483         if (a->ls_event > b->ls_event)
484                 return (1);
485
486         for (i = g_stkdepth - 1; i >= 0; i--) {
487                 if (a->ls_stack[i] < b->ls_stack[i])
488                         return (-1);
489                 if (a->ls_stack[i] > b->ls_stack[i])
490                         return (1);
491         }
492
493         if (a->ls_caller < b->ls_caller)
494                 return (-1);
495         if (a->ls_caller > b->ls_caller)
496                 return (1);
497
498         return (b->ls_count - a->ls_count);
499 }
500
501 static void
502 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
503 {
504         int m = n / 2;
505         int i, j;
506
507         if (m > 1)
508                 lsmergesort(cmp, a, b, m);
509         if (n - m > 1)
510                 lsmergesort(cmp, a + m, b + m, n - m);
511         for (i = m; i > 0; i--)
512                 b[i - 1] = a[i - 1];
513         for (j = m - 1; j < n - 1; j++)
514                 b[n + m - j - 2] = a[j + 1];
515         while (i < j)
516                 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
517         *a = b[i];
518 }
519
520 static void
521 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
522 {
523         int i, j;
524         lsrec_t *target, *current;
525
526         target = lock[0];
527
528         for (i = 1; i < n; i++) {
529                 current = lock[i];
530                 if (cmp(current, target) != 0) {
531                         target = current;
532                         continue;
533                 }
534                 current->ls_event = LS_MAX_EVENTS;
535                 target->ls_count += current->ls_count;
536                 target->ls_refcnt += current->ls_refcnt;
537                 if (g_recsize < LS_TIME)
538                         continue;
539                 target->ls_time += current->ls_time;
540                 if (g_recsize < LS_HIST)
541                         continue;
542                 for (j = 0; j < 64; j++)
543                         target->ls_hist[j] += current->ls_hist[j];
544         }
545 }
546
/*
 * If *addrp resolves to a symbol and lies within that symbol's size, rewind
 * *addrp to the start of the symbol; otherwise leave it unchanged.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
        uintptr_t off;
        size_t len;

        if (addr_to_sym(*addrp, &off, &len) == NULL)
                return;

        /* Offsets at or beyond the symbol's size are left alone. */
        if (off >= len)
                return;

        *addrp -= off;
}
556
/*
 * AND a new term onto a heap-allocated D predicate string.
 *
 * pred   in/out: address of the predicate string; *pred may be NULL or
 *        empty, in which case the new term becomes the whole predicate.
 *        The old string is freed and replaced by a newly allocated one.
 * what   the term (or left-hand operand when cmp is given); if NULL the
 *        call is a no-op and *pred is left exactly as it was
 * cmp    comparison operator text, or NULL if 'what' is a complete term
 * value  right-hand operand, emitted in hex; used only when cmp != NULL
 *
 * The value is formatted with an explicit "0x" and an integer conversion.
 * The previous "0x%p" relied on %p printing bare hex digits, which is
 * implementation-defined: where %p already emits a "0x" prefix the result
 * was "0x0x...", which is not a valid constant.
 *
 * Exits with status 2 if memory cannot be allocated.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
        const char *old;
        char *new;
        size_t newlen;

        if (what == NULL)
                return;

        old = (*pred != NULL) ? *pred : "";

        /*
         * Both strings, the operator, and 32 bytes of slack for the hex
         * value (at most "0x" + 16 digits), separators and the NUL, plus
         * the "( && )" glue.
         */
        newlen = strlen(old) + strlen(what) +
            (cmp != NULL ? strlen(cmp) : 0) + 32 + strlen("( && )");

        if ((new = malloc(newlen)) == NULL) {
                (void) fprintf(stderr, "lockstat: out of memory\n");
                exit(2);
        }

        if (old[0] != '\0') {
                if (cmp != NULL) {
                        (void) snprintf(new, newlen, "(%s) && (%s %s 0x%llx)",
                            old, what, cmp, (unsigned long long)value);
                } else {
                        (void) snprintf(new, newlen, "(%s) && (%s)",
                            old, what);
                }
        } else {
                if (cmp != NULL) {
                        (void) snprintf(new, newlen, "%s %s 0x%llx",
                            what, cmp, (unsigned long long)value);
                } else {
                        (void) snprintf(new, newlen, "%s", what);
                }
        }

        free(*pred);
        *pred = new;
}
594
/*
 * Release a predicate string built by predicate_add() and reset the
 * caller's pointer to NULL.  Safe to call when *pred is already NULL.
 */
static void
predicate_destroy(char **pred)
{
        char *doomed = *pred;

        *pred = NULL;
        free(doomed);
}
601
/*
 * OR a new address-range clause onto a heap-allocated D filter string.
 *
 * filt  in/out: address of the filter string; *filt may be NULL or empty.
 *       The old string is freed and replaced by a newly allocated one.
 * what  name of the D expression being range-checked
 * base  inclusive lower bound of the range
 * size  length of the range; the exclusive upper bound is base + size
 *
 * The appended clause is "(<what> >= 0x<base> && <what> < 0x<base+size>)",
 * preceded by " || " when the filter is not empty.  Addresses are formatted
 * with an explicit "0x" and an integer conversion, so the text does not
 * depend on the implementation-defined output of %p and no per-platform
 * format string is needed.
 *
 * Exits with status 2 if the clause does not fit the local buffer or if
 * memory cannot be allocated.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
        const char *old = (*filt != NULL) ? *filt : "";
        char clause[256], *new;
        size_t oldlen;
        int n;

        n = snprintf(clause, sizeof (clause),
            "%s(%s >= 0x%llx && %s < 0x%llx)",
            old[0] != '\0' ? " || " : "",
            what, (unsigned long long)base,
            what, (unsigned long long)(base + size));

        if (n < 0 || (size_t)n >= sizeof (clause)) {
                (void) fprintf(stderr, "lockstat: filter clause too long\n");
                exit(2);
        }

        oldlen = strlen(old);
        if ((new = malloc(oldlen + (size_t)n + 1)) == NULL) {
                (void) fprintf(stderr, "lockstat: out of memory\n");
                exit(2);
        }

        (void) memcpy(new, old, oldlen);
        /* n + 1 copies the clause together with its terminating NUL. */
        (void) memcpy(new + oldlen, clause, (size_t)n + 1);

        free(*filt);
        *filt = new;
}
628
/*
 * Release a filter string built by filter_add() and reset the caller's
 * pointer to NULL.  Safe to call when *filt is already NULL.
 */
static void
filter_destroy(char **filt)
{
        char *doomed = *filt;

        *filt = NULL;
        free(doomed);
}
635
636 static void
637 dprog_add(const char *fmt, ...)
638 {
639         va_list args;
640         int size, offs;
641         char c;
642
643         va_start(args, fmt);
644         size = vsnprintf(&c, 1, fmt, args) + 1;
645         va_end(args);
646
647         if (g_proglen == 0) {
648                 offs = 0;
649         } else {
650                 offs = g_proglen - 1;
651         }
652
653         g_proglen = offs + size;
654
655         if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
656                 fail(1, "failed to reallocate program text");
657
658         va_start(args, fmt);
659         (void) vsnprintf(&g_prog[offs], size, fmt, args);
660         va_end(args);
661 }
662
663 /*
664  * This function may read like an open sewer, but keep in mind that programs
665  * that generate other programs are rarely pretty.  If one has the unenviable
666  * task of maintaining or -- worse -- extending this code, use the -V option
667  * to examine the D program as generated by this function.
668  */
669 static void
670 dprog_addevent(int event)
671 {
672         ls_event_info_t *info = &g_event_info[event];
673         char *pred = NULL;
674         char stack[20];
675         const char *arg0, *caller;
676         char *arg1 = "arg1";
677         char buf[80];
678         hrtime_t dur;
679         int depth;
680
681         if (info->ev_name[0] == '\0')
682                 return;
683
684         if (info->ev_type == 'I') {
685                 /*
686                  * For interrupt events, arg0 (normally the lock pointer) is
687                  * the CPU address plus the current pil, and arg1 (normally
688                  * the number of nanoseconds) is the number of nanoseconds
689                  * late -- and it's stored in arg2.
690                  */
691 #if defined(sun)
692                 arg0 = "(uintptr_t)curthread->t_cpu + \n"
693                     "\t    curthread->t_cpu->cpu_profile_pil";
694 #else
695                 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
696                     "\t    0x01000000 + curthread->td_pri_class";
697 #endif
698                 caller = "(uintptr_t)arg0";
699                 arg1 = "arg2";
700         } else {
701                 arg0 = "(uintptr_t)arg0";
702                 caller = "caller";
703         }
704
705         if (g_recsize > LS_HIST) {
706                 for (depth = 0; g_recsize > LS_STACK(depth); depth++)
707                         continue;
708
709                 if (g_tracing) {
710                         (void) sprintf(stack, "\tstack(%d);\n", depth);
711                 } else {
712                         (void) sprintf(stack, ", stack(%d)", depth);
713                 }
714         } else {
715                 (void) sprintf(stack, "");
716         }
717
718         if (info->ev_acquire != NULL) {
719                 /*
720                  * If this is a hold event, we need to generate an additional
721                  * clause for the acquire; the clause for the release will be
722                  * generated with the aggregating statement, below.
723                  */
724                 dprog_add("%s\n", info->ev_acquire);
725                 predicate_add(&pred, info->ev_predicate, NULL, 0);
726                 predicate_add(&pred, g_predicate, NULL, 0);
727                 if (pred != NULL)
728                         dprog_add("/%s/\n", pred);
729
730                 dprog_add("{\n");
731                 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
732
733                 if (info->ev_type == 'H') {
734                         dprog_add("\t%s = timestamp;\n", buf);
735                 } else {
736                         /*
737                          * If this isn't a hold event, it's the recursive
738                          * error event.  For this, we simply bump the
739                          * thread-local, per-lock count.
740                          */
741                         dprog_add("\t%s++;\n", buf);
742                 }
743
744                 dprog_add("}\n\n");
745                 predicate_destroy(&pred);
746                 pred = NULL;
747
748                 if (info->ev_type == 'E') {
749                         /*
750                          * If this is the recursive lock error event, we need
751                          * to generate an additional clause to decrement the
752                          * thread-local, per-lock count.  This assures that we
753                          * only execute the aggregating clause if we have
754                          * recursive entry.
755                          */
756                         dprog_add("%s\n", info->ev_name);
757                         dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
758                 }
759
760                 predicate_add(&pred, buf, NULL, 0);
761
762                 if (info->ev_type == 'H') {
763                         (void) sprintf(buf, "timestamp -\n\t    "
764                             "self->ev%d[(uintptr_t)arg0]", event);
765                 }
766
767                 arg1 = buf;
768         } else {
769                 predicate_add(&pred, info->ev_predicate, NULL, 0);
770                 if (info->ev_type != 'I')
771                         predicate_add(&pred, g_predicate, NULL, 0);
772                 else
773                         predicate_add(&pred, g_ipredicate, NULL, 0);
774         }
775
776         if ((dur = g_min_duration[event]) != 0)
777                 predicate_add(&pred, arg1, ">=", dur);
778
779         dprog_add("%s\n", info->ev_name);
780
781         if (pred != NULL)
782                 dprog_add("/%s/\n", pred);
783         predicate_destroy(&pred);
784
785         dprog_add("{\n");
786
787         if (g_tracing) {
788                 dprog_add("\ttrace(%dULL);\n", event);
789                 dprog_add("\ttrace(%s);\n", arg0);
790                 dprog_add("\ttrace(%s);\n", caller);
791                 dprog_add(stack);
792         } else {
793                 /*
794                  * The ordering here is important:  when we process the
795                  * aggregate, we count on the fact that @avg appears before
796                  * @hist in program order to assure that @avg is assigned the
797                  * first aggregation variable ID and @hist assigned the
798                  * second; see the comment in process_aggregate() for details.
799                  */
800                 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
801                     event, arg0, caller, stack, arg1);
802
803                 if (g_recsize >= LS_HIST) {
804                         dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
805                             "(%s);\n", event, arg0, caller, stack, arg1);
806                 }
807         }
808
809         if (info->ev_acquire != NULL)
810                 dprog_add("\tself->ev%d[arg0] = 0;\n", event);
811
812         dprog_add("}\n\n");
813 }
814
815 static void
816 dprog_compile()
817 {
818         dtrace_prog_t *prog;
819         dtrace_proginfo_t info;
820
821         if (g_Vflag) {
822                 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
823                 (void) fputs(g_prog, stderr);
824                 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
825         }
826
827         if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
828             DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
829                 dfail("failed to compile program");
830
831         if (dtrace_program_exec(g_dtp, prog, &info) == -1)
832                 dfail("failed to enable probes");
833
834         if (dtrace_go(g_dtp) != 0)
835                 dfail("couldn't start tracing");
836 }
837
/*
 * SIGUSR1 handler installed by status_init().  The body is intentionally
 * empty: the handler exists only so that the signal can be delivered
 * without the default action being taken.
 */
static void
#if defined(sun)
status_fire(void)
#else
status_fire(int i)
#endif
{
	/* nothing to do */
}
845
846 static void
847 status_init(void)
848 {
849         dtrace_optval_t val, status, agg;
850         struct sigaction act;
851         struct itimerspec ts;
852         struct sigevent ev;
853         timer_t tid;
854
855         if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
856                 dfail("failed to get 'statusrate'");
857
858         if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
859                 dfail("failed to get 'statusrate'");
860
861         /*
862          * We would want to awaken at a rate that is the GCD of the statusrate
863          * and the aggrate -- but that seems a bit absurd.  Instead, we'll
864          * simply awaken at a rate that is the more frequent of the two, which
865          * assures that we're never later than the interval implied by the
866          * more frequent rate.
867          */
868         val = status < agg ? status : agg;
869
870         (void) sigemptyset(&act.sa_mask);
871         act.sa_flags = 0;
872         act.sa_handler = status_fire;
873         (void) sigaction(SIGUSR1, &act, NULL);
874
875         ev.sigev_notify = SIGEV_SIGNAL;
876         ev.sigev_signo = SIGUSR1;
877
878         if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
879                 dfail("cannot create CLOCK_REALTIME timer");
880
881         ts.it_value.tv_sec = val / NANOSEC;
882         ts.it_value.tv_nsec = val % NANOSEC;
883         ts.it_interval = ts.it_value;
884
885         if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
886                 dfail("cannot set time on CLOCK_REALTIME timer");
887 }
888
889 static void
890 status_check(void)
891 {
892         if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
893                 dfail("failed to snap aggregate");
894
895         if (dtrace_status(g_dtp) == -1)
896                 dfail("dtrace_status()");
897 }
898
899 static void
900 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
901 {
902         bzero(lsrec, g_recsize);
903         lsrec->ls_count = 1;
904
905         if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
906                 fail(0, "truncated DTrace record");
907
908         if (rec->dtrd_size != sizeof (uint64_t))
909                 fail(0, "bad event size in first record");
910
911         /* LINTED - alignment */
912         lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
913         rec++;
914
915         if (rec->dtrd_size != sizeof (uintptr_t))
916                 fail(0, "bad lock address size in second record");
917
918         /* LINTED - alignment */
919         lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
920         rec++;
921
922         if (rec->dtrd_size != sizeof (uintptr_t))
923                 fail(0, "bad caller size in third record");
924
925         /* LINTED - alignment */
926         lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
927         rec++;
928
929         if (g_recsize > LS_HIST) {
930                 int frames, i;
931                 pc_t *stack;
932
933                 frames = rec->dtrd_size / sizeof (pc_t);
934                 /* LINTED - alignment */
935                 stack = (pc_t *)(data + rec->dtrd_offset);
936
937                 for (i = 1; i < frames; i++)
938                         lsrec->ls_stack[i - 1] = stack[i];
939         }
940 }
941
942 /*ARGSUSED*/
943 static int
944 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
945 {
946         *((size_t *)arg) += 1;
947
948         return (DTRACE_AGGWALK_NEXT);
949 }
950
951 static int
952 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
953 {
954         const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
955         caddr_t data = agg->dtada_data;
956         lsdata_t *lsdata = arg;
957         lsrec_t *lsrec = lsdata->lsd_next;
958         const dtrace_recdesc_t *rec;
959         uint64_t *avg, *quantized;
960         int i, j;
961
962         assert(lsdata->lsd_count < g_nrecs);
963
964         /*
965          * Aggregation variable IDs are guaranteed to be generated in program
966          * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
967          * plus one.  As "avg" appears before "hist" in program order, we know
968          * that "avg" will be allocated the first aggregation variable ID, and
969          * "hist" will be allocated the second aggregation variable ID -- and
970          * we therefore use the aggregation variable ID to differentiate the
971          * cases.
972          */
973         if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
974                 /*
975                  * If this is the histogram entry.  We'll copy the quantized
976                  * data into lc_hist, and jump over the rest.
977                  */
978                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
979
980                 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
981                         fail(0, "bad variable ID in aggregation record");
982
983                 if (rec->dtrd_size !=
984                     DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
985                         fail(0, "bad quantize size in aggregation record");
986
987                 /* LINTED - alignment */
988                 quantized = (uint64_t *)(data + rec->dtrd_offset);
989
990                 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
991                     i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
992                         lsrec->ls_hist[j] = quantized[i];
993
994                 goto out;
995         }
996
997         lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
998             aggdesc->dtagd_nrecs - 1, data);
999
1000         rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1001
1002         if (rec->dtrd_size != 2 * sizeof (uint64_t))
1003                 fail(0, "bad avg size in aggregation record");
1004
1005         /* LINTED - alignment */
1006         avg = (uint64_t *)(data + rec->dtrd_offset);
1007         lsrec->ls_count = (uint32_t)avg[0];
1008         lsrec->ls_time = (uintptr_t)avg[1];
1009
1010         if (g_recsize >= LS_HIST)
1011                 return (DTRACE_AGGWALK_NEXT);
1012
1013 out:
1014         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1015         lsdata->lsd_count++;
1016
1017         return (DTRACE_AGGWALK_NEXT);
1018 }
1019
1020 static int
1021 process_trace(const dtrace_probedata_t *pdata, void *arg)
1022 {
1023         lsdata_t *lsdata = arg;
1024         lsrec_t *lsrec = lsdata->lsd_next;
1025         dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1026         caddr_t data = pdata->dtpda_data;
1027
1028         if (lsdata->lsd_count >= g_nrecs)
1029                 return (DTRACE_CONSUME_NEXT);
1030
1031         lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1032
1033         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1034         lsdata->lsd_count++;
1035
1036         return (DTRACE_CONSUME_NEXT);
1037 }
1038
1039 static int
1040 process_data(FILE *out, char *data)
1041 {
1042         lsdata_t lsdata;
1043
1044         /* LINTED - alignment */
1045         lsdata.lsd_next = (lsrec_t *)data;
1046         lsdata.lsd_count = 0;
1047
1048         if (g_tracing) {
1049                 if (dtrace_consume(g_dtp, out,
1050                     process_trace, NULL, &lsdata) != 0)
1051                         dfail("failed to consume buffer");
1052
1053                 return (lsdata.lsd_count);
1054         }
1055
1056         if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1057             process_aggregate, &lsdata) != 0)
1058                 dfail("failed to walk aggregate");
1059
1060         return (lsdata.lsd_count);
1061 }
1062
1063 /*ARGSUSED*/
1064 static int
1065 drophandler(const dtrace_dropdata_t *data, void *arg)
1066 {
1067         g_dropped++;
1068         (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1069         return (DTRACE_HANDLE_OK);
1070 }
1071
1072 int
1073 main(int argc, char **argv)
1074 {
1075         char *data_buf;
1076         lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1077         FILE *out = stdout;
1078         int c;
1079         pid_t child;
1080         int status;
1081         int i, j;
1082         hrtime_t duration;
1083         char *addrp, *offp, *sizep, *evp, *lastp, *p;
1084         uintptr_t addr;
1085         size_t size, off;
1086         int events_specified = 0;
1087         int exec_errno = 0;
1088         uint32_t event;
1089         char *filt = NULL, *ifilt = NULL;
1090         static uint64_t ev_count[LS_MAX_EVENTS + 1];
1091         static uint64_t ev_time[LS_MAX_EVENTS + 1];
1092         dtrace_optval_t aggsize;
1093         char aggstr[10];
1094         long ncpus;
1095         int dynvar = 0;
1096         int err;
1097
1098         if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1099                 fail(0, "cannot open dtrace library: %s",
1100                     dtrace_errmsg(NULL, err));
1101         }
1102
1103         if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1104                 dfail("couldn't establish drop handler");
1105
1106         if (symtab_init() == -1)
1107                 fail(1, "can't load kernel symbols");
1108
1109         g_nrecs = DEFAULT_NRECS;
1110
1111         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1112                 switch (c) {
1113                 case 'b':
1114                         g_recsize = LS_BASIC;
1115                         break;
1116
1117                 case 't':
1118                         g_recsize = LS_TIME;
1119                         break;
1120
1121                 case 'h':
1122                         g_recsize = LS_HIST;
1123                         break;
1124
1125                 case 's':
1126                         if (!isdigit(optarg[0]))
1127                                 usage();
1128                         g_stkdepth = atoi(optarg);
1129                         if (g_stkdepth > LS_MAX_STACK_DEPTH)
1130                                 fail(0, "max stack depth is %d",
1131                                     LS_MAX_STACK_DEPTH);
1132                         g_recsize = LS_STACK(g_stkdepth);
1133                         break;
1134
1135                 case 'n':
1136                         if (!isdigit(optarg[0]))
1137                                 usage();
1138                         g_nrecs = atoi(optarg);
1139                         break;
1140
1141                 case 'd':
1142                         if (!isdigit(optarg[0]))
1143                                 usage();
1144                         duration = atoll(optarg);
1145
1146                         /*
1147                          * XXX -- durations really should be per event
1148                          * since the units are different, but it's hard
1149                          * to express this nicely in the interface.
1150                          * Not clear yet what the cleanest solution is.
1151                          */
1152                         for (i = 0; i < LS_MAX_EVENTS; i++)
1153                                 if (g_event_info[i].ev_type != 'E')
1154                                         g_min_duration[i] = duration;
1155
1156                         break;
1157
1158                 case 'i':
1159                         if (!isdigit(optarg[0]))
1160                                 usage();
1161                         i = atoi(optarg);
1162                         if (i <= 0)
1163                                 usage();
1164                         if (i > MAX_HZ)
1165                                 fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1166
1167                         for (j = 0; j < LS_MAX_EVENTS; j++)
1168                                 if (strcmp(g_event_info[j].ev_desc,
1169                                     "Profiling interrupt") == 0)
1170                                         break;
1171
1172                         (void) sprintf(g_event_info[j].ev_name,
1173                             "profile:::profile-%d", i);
1174                         break;
1175
1176                 case 'l':
1177                 case 'f':
1178                         addrp = strtok(optarg, ",");
1179                         sizep = strtok(NULL, ",");
1180                         addrp = strtok(optarg, ",+");
1181                         offp = strtok(NULL, ",");
1182
1183                         size = sizep ? strtoul(sizep, NULL, 0) : 1;
1184                         off = offp ? strtoul(offp, NULL, 0) : 0;
1185
1186                         if (addrp[0] == '0') {
1187                                 addr = strtoul(addrp, NULL, 16) + off;
1188                         } else {
1189                                 addr = sym_to_addr(addrp) + off;
1190                                 if (sizep == NULL)
1191                                         size = sym_size(addrp) - off;
1192                                 if (addr - off == 0)
1193                                         fail(0, "symbol '%s' not found", addrp);
1194                                 if (size == 0)
1195                                         size = 1;
1196                         }
1197
1198
1199                         if (c == 'l') {
1200                                 filter_add(&filt, "arg0", addr, size);
1201                         } else {
1202                                 filter_add(&filt, "caller", addr, size);
1203                                 filter_add(&ifilt, "arg0", addr, size);
1204                         }
1205                         break;
1206
1207                 case 'e':
1208                         evp = strtok_r(optarg, ",", &lastp);
1209                         while (evp) {
1210                                 int ev1, ev2;
1211                                 char *evp2;
1212
1213                                 (void) strtok(evp, "-");
1214                                 evp2 = strtok(NULL, "-");
1215                                 ev1 = atoi(evp);
1216                                 ev2 = evp2 ? atoi(evp2) : ev1;
1217                                 if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1218                                     (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1219                                         fail(0, "-e events out of range");
1220                                 for (i = ev1; i <= ev2; i++)
1221                                         g_enabled[i] = 1;
1222                                 evp = strtok_r(NULL, ",", &lastp);
1223                         }
1224                         events_specified = 1;
1225                         break;
1226
1227                 case 'c':
1228                         g_cflag = 1;
1229                         break;
1230
1231                 case 'k':
1232                         g_kflag = 1;
1233                         break;
1234
1235                 case 'w':
1236                         g_wflag = 1;
1237                         break;
1238
1239                 case 'W':
1240                         g_Wflag = 1;
1241                         break;
1242
1243                 case 'g':
1244                         g_gflag = 1;
1245                         break;
1246
1247                 case 'C':
1248                 case 'E':
1249                 case 'H':
1250                 case 'I':
1251                         for (i = 0; i < LS_MAX_EVENTS; i++)
1252                                 if (g_event_info[i].ev_type == c)
1253                                         g_enabled[i] = 1;
1254                         events_specified = 1;
1255                         break;
1256
1257                 case 'A':
1258                         for (i = 0; i < LS_MAX_EVENTS; i++)
1259                                 if (strchr("CH", g_event_info[i].ev_type))
1260                                         g_enabled[i] = 1;
1261                         events_specified = 1;
1262                         break;
1263
1264                 case 'T':
1265                         g_tracing = 1;
1266                         break;
1267
1268                 case 'D':
1269                         if (!isdigit(optarg[0]))
1270                                 usage();
1271                         g_topn = atoi(optarg);
1272                         break;
1273
1274                 case 'R':
1275                         g_rates = 1;
1276                         break;
1277
1278                 case 'p':
1279                         g_pflag = 1;
1280                         break;
1281
1282                 case 'P':
1283                         g_Pflag = 1;
1284                         break;
1285
1286                 case 'o':
1287                         if ((out = fopen(optarg, "w")) == NULL)
1288                                 fail(1, "error opening file");
1289                         break;
1290
1291                 case 'V':
1292                         g_Vflag = 1;
1293                         break;
1294
1295                 default:
1296                         if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1297                                 usage();
1298                 }
1299         }
1300
1301         if (filt != NULL) {
1302                 predicate_add(&g_predicate, filt, NULL, 0);
1303                 filter_destroy(&filt);
1304         }
1305
1306         if (ifilt != NULL) {
1307                 predicate_add(&g_ipredicate, ifilt, NULL, 0);
1308                 filter_destroy(&ifilt);
1309         }
1310
1311         if (g_recsize == 0) {
1312                 if (g_gflag) {
1313                         g_stkdepth = LS_MAX_STACK_DEPTH;
1314                         g_recsize = LS_STACK(g_stkdepth);
1315                 } else {
1316                         g_recsize = LS_TIME;
1317                 }
1318         }
1319
1320         if (g_gflag && g_recsize <= LS_STACK(0))
1321                 fail(0, "'-g' requires at least '-s 1' data gathering");
1322
1323         /*
1324          * Make sure the alignment is reasonable
1325          */
1326         g_recsize = -(-g_recsize & -sizeof (uint64_t));
1327
1328         for (i = 0; i < LS_MAX_EVENTS; i++) {
1329                 /*
1330                  * If no events were specified, enable -C.
1331                  */
1332                 if (!events_specified && g_event_info[i].ev_type == 'C')
1333                         g_enabled[i] = 1;
1334         }
1335
1336         for (i = 0; i < LS_MAX_EVENTS; i++) {
1337                 if (!g_enabled[i])
1338                         continue;
1339
1340                 if (g_event_info[i].ev_acquire != NULL) {
1341                         /*
1342                          * If we've enabled a hold event, we must explicitly
1343                          * allocate dynamic variable space.
1344                          */
1345                         dynvar = 1;
1346                 }
1347
1348                 dprog_addevent(i);
1349         }
1350
1351         /*
1352          * Make sure there are remaining arguments to specify a child command
1353          * to execute.
1354          */
1355         if (argc <= optind)
1356                 usage();
1357
1358         if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1359                 dfail("couldn't determine number of online CPUs");
1360
1361         /*
1362          * By default, we set our data buffer size to be the number of records
1363          * multiplied by the size of the record, doubled to account for some
1364          * DTrace slop and divided by the number of CPUs.  We silently clamp
1365          * the aggregation size at both a minimum and a maximum to prevent
1366          * absurdly low or high values.
1367          */
1368         if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1369                 aggsize = MIN_AGGSIZE;
1370
1371         if (aggsize > MAX_AGGSIZE)
1372                 aggsize = MAX_AGGSIZE;
1373
1374         (void) sprintf(aggstr, "%lld", (long long)aggsize);
1375
1376         if (!g_tracing) {
1377                 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1378                         dfail("failed to set 'bufsize'");
1379
1380                 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1381                         dfail("failed to set 'aggsize'");
1382
1383                 if (dynvar) {
1384                         /*
1385                          * If we're using dynamic variables, we set our
1386                          * dynamic variable size to be one megabyte per CPU,
1387                          * with a hard-limit of 32 megabytes.  This may still
1388                          * be too small in some cases, but it can be tuned
1389                          * manually via -x if need be.
1390                          */
1391                         (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1392
1393                         if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1394                                 dfail("failed to set 'dynvarsize'");
1395                 }
1396         } else {
1397                 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1398                         dfail("failed to set 'bufsize'");
1399         }
1400
1401         if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1402                 dfail("failed to set 'statusrate'");
1403
1404         optind = 1;
1405         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1406                 switch (c) {
1407                 case 'x':
1408                         if ((p = strchr(optarg, '=')) != NULL)
1409                                 *p++ = '\0';
1410
1411                         if (dtrace_setopt(g_dtp, optarg, p) != 0)
1412                                 dfail("failed to set -x %s", optarg);
1413                         break;
1414                 }
1415         }
1416
1417         argc -= optind;
1418         argv += optind;
1419
1420         dprog_compile();
1421         status_init();
1422
1423         g_elapsed = -gethrtime();
1424
1425         /*
1426          * Spawn the specified command and wait for it to complete.
1427          */
1428         child = fork();
1429         if (child == -1)
1430                 fail(1, "cannot fork");
1431         if (child == 0) {
1432                 (void) dtrace_close(g_dtp);
1433                 (void) execvp(argv[0], &argv[0]);
1434                 exec_errno = errno;
1435                 exit(127);
1436         }
1437
1438 #if defined(sun)
1439         while (waitpid(child, &status, WEXITED) != child)
1440 #else
1441         while (waitpid(child, &status, 0) != child)
1442 #endif
1443                 status_check();
1444
1445         g_elapsed += gethrtime();
1446
1447         if (WIFEXITED(status)) {
1448                 if (WEXITSTATUS(status) != 0) {
1449                         if (exec_errno != 0) {
1450                                 errno = exec_errno;
1451                                 fail(1, "could not execute %s", argv[0]);
1452                         }
1453                         (void) fprintf(stderr,
1454                             "lockstat: warning: %s exited with code %d\n",
1455                             argv[0], WEXITSTATUS(status));
1456                 }
1457         } else {
1458                 (void) fprintf(stderr,
1459                     "lockstat: warning: %s died on signal %d\n",
1460                     argv[0], WTERMSIG(status));
1461         }
1462
1463         if (dtrace_stop(g_dtp) == -1)
1464                 dfail("failed to stop dtrace");
1465
1466         /*
1467          * Before we read out the results, we need to allocate our buffer.
1468          * If we're tracing, then we'll just use the precalculated size.  If
1469          * we're not, then we'll take a snapshot of the aggregate, and walk
1470          * it to count the number of records.
1471          */
1472         if (!g_tracing) {
1473                 if (dtrace_aggregate_snap(g_dtp) != 0)
1474                         dfail("failed to snap aggregate");
1475
1476                 g_nrecs = 0;
1477
1478                 if (dtrace_aggregate_walk(g_dtp,
1479                     count_aggregate, &g_nrecs) != 0)
1480                         dfail("failed to walk aggregate");
1481         }
1482
1483 #if defined(sun)
1484         if ((data_buf = memalign(sizeof (uint64_t),
1485             (g_nrecs + 1) * g_recsize)) == NULL)
1486 #else
1487         if (posix_memalign((void **)&data_buf, sizeof (uint64_t),  
1488             (g_nrecs + 1) * g_recsize) )
1489 #endif
1490                 fail(1, "Memory allocation failed");
1491
1492         /*
1493          * Read out the DTrace data.
1494          */
1495         g_nrecs_used = process_data(out, data_buf);
1496
1497         if (g_nrecs_used > g_nrecs || g_dropped)
1498                 (void) fprintf(stderr, "lockstat: warning: "
1499                     "ran out of data records (use -n for more)\n");
1500
1501         /* LINTED - alignment */
1502         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1503             /* LINTED - alignment */
1504             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1505                 ev_count[lsp->ls_event] += lsp->ls_count;
1506                 ev_time[lsp->ls_event] += lsp->ls_time;
1507         }
1508
1509         /*
1510          * If -g was specified, convert stacks into individual records.
1511          */
1512         if (g_gflag) {
1513                 lsrec_t *newlsp, *oldlsp;
1514
1515 #if defined(sun)
1516                 newlsp = memalign(sizeof (uint64_t),
1517                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1518 #else
1519                 posix_memalign((void **)&newlsp, sizeof (uint64_t), 
1520                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1521 #endif
1522                 if (newlsp == NULL)
1523                         fail(1, "Cannot allocate space for -g processing");
1524                 lsp = newlsp;
1525                 /* LINTED - alignment */
1526                 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1527                     /* LINTED - alignment */
1528                     oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1529                         int fr;
1530                         int caller_in_stack = 0;
1531
1532                         if (oldlsp->ls_count == 0)
1533                                 continue;
1534
1535                         for (fr = 0; fr < g_stkdepth; fr++) {
1536                                 if (oldlsp->ls_stack[fr] == 0)
1537                                         break;
1538                                 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1539                                         caller_in_stack = 1;
1540                                 bcopy(oldlsp, lsp, LS_TIME);
1541                                 lsp->ls_caller = oldlsp->ls_stack[fr];
1542                                 /* LINTED - alignment */
1543                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1544                         }
1545                         if (!caller_in_stack) {
1546                                 bcopy(oldlsp, lsp, LS_TIME);
1547                                 /* LINTED - alignment */
1548                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1549                         }
1550                 }
1551                 g_nrecs = g_nrecs_used =
1552                     ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1553                 g_recsize = LS_TIME;
1554                 g_stkdepth = 0;
1555                 free(data_buf);
1556                 data_buf = (char *)newlsp;
1557         }
1558
1559         if ((sort_buf = calloc(2 * (g_nrecs + 1),
1560             sizeof (void *))) == NULL)
1561                 fail(1, "Sort buffer allocation failed");
1562         merge_buf = sort_buf + (g_nrecs + 1);
1563
1564         /*
1565          * Build the sort buffer, discarding zero-count records along the way.
1566          */
1567         /* LINTED - alignment */
1568         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1569             /* LINTED - alignment */
1570             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1571                 if (lsp->ls_count == 0)
1572                         lsp->ls_event = LS_MAX_EVENTS;
1573                 sort_buf[i] = lsp;
1574         }
1575
1576         if (g_nrecs_used == 0)
1577                 exit(0);
1578
1579         /*
1580          * Add a sentinel after the last record
1581          */
1582         sort_buf[i] = lsp;
1583         lsp->ls_event = LS_MAX_EVENTS;
1584
1585         if (g_tracing) {
1586                 report_trace(out, sort_buf);
1587                 return (0);
1588         }
1589
1590         /*
1591          * Application of -g may have resulted in multiple records
1592          * with the same signature; coalesce them.
1593          */
1594         if (g_gflag) {
1595                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1596                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1597         }
1598
1599         /*
1600          * Coalesce locks within the same symbol if -c option specified.
1601          * Coalesce PCs within the same function if -k option specified.
1602          */
1603         if (g_cflag || g_kflag) {
1604                 for (i = 0; i < g_nrecs_used; i++) {
1605                         int fr;
1606                         lsp = sort_buf[i];
1607                         if (g_cflag)
1608                                 coalesce_symbol(&lsp->ls_lock);
1609                         if (g_kflag) {
1610                                 for (fr = 0; fr < g_stkdepth; fr++)
1611                                         coalesce_symbol(&lsp->ls_stack[fr]);
1612                                 coalesce_symbol(&lsp->ls_caller);
1613                         }
1614                 }
1615                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1616                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1617         }
1618
1619         /*
1620          * Coalesce callers if -w option specified
1621          */
1622         if (g_wflag) {
1623                 mergesort(lock_and_count_cmp_anywhere,
1624                     sort_buf, merge_buf, g_nrecs_used);
1625                 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1626         }
1627
1628         /*
1629          * Coalesce locks if -W option specified
1630          */
1631         if (g_Wflag) {
1632                 mergesort(site_and_count_cmp_anylock,
1633                     sort_buf, merge_buf, g_nrecs_used);
1634                 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1635         }
1636
1637         /*
1638          * Sort data by contention count (ls_count) or total time (ls_time),
1639          * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1640          */
1641         if (g_recsize < LS_TIME)
1642                 g_Pflag = 0;
1643
1644         if (g_Pflag)
1645                 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1646         else
1647                 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1648
1649         /*
1650          * Display data by event type
1651          */
1652         first = &sort_buf[0];
1653         while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1654                 current = first;
1655                 while ((lsp = *current)->ls_event == event)
1656                         current++;
1657                 report_stats(out, first, current - first, ev_count[event],
1658                     ev_time[event]);
1659                 first = current;
1660         }
1661
1662         return (0);
1663 }
1664
1665 static char *
1666 format_symbol(char *buf, uintptr_t addr, int show_size)
1667 {
1668         uintptr_t symoff;
1669         char *symname;
1670         size_t symsize;
1671
1672         symname = addr_to_sym(addr, &symoff, &symsize);
1673
1674         if (show_size && symoff == 0)
1675                 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1676         else if (symoff == 0)
1677                 (void) sprintf(buf, "%s", symname);
1678         else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)  /* CPU+PIL */
1679 #if defined(sun)
1680                 (void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1681 #else
1682                 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1683 #endif
1684         else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1685                 (void) sprintf(buf, "%s+0x%llx", symname,
1686                     (unsigned long long)symoff);
1687         else
1688                 (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1689         return (buf);
1690 }
1691
1692 static void
1693 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1694         uint64_t total_time)
1695 {
1696         uint32_t event = sort_buf[0]->ls_event;
1697         lsrec_t *lsp;
1698         double ptotal = 0.0;
1699         double percent;
1700         int i, j, fr;
1701         int displayed;
1702         int first_bin, last_bin, max_bin_count, total_bin_count;
1703         int rectype;
1704         char buf[256];
1705         char lhdr[80], chdr[80];
1706
1707         rectype = g_recsize;
1708
1709         if (g_topn == 0) {
1710                 (void) fprintf(out, "%20llu %s\n",
1711                     g_rates == 0 ? total_count :
1712                     ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1713                     g_event_info[event].ev_desc);
1714                 return;
1715         }
1716
1717         (void) sprintf(lhdr, "%s%s",
1718             g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1719         (void) sprintf(chdr, "%s%s",
1720             g_wflag ? "Hottest " : "", "Caller");
1721
1722         if (!g_pflag)
1723                 (void) fprintf(out,
1724                     "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1725                     g_event_info[event].ev_desc, (double)total_count,
1726                     (double)g_elapsed / NANOSEC,
1727                     (double)total_count * NANOSEC / g_elapsed);
1728
1729         if (!g_pflag && rectype < LS_HIST) {
1730                 (void) sprintf(buf, "%s", g_event_info[event].ev_units);
1731                 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1732                     g_rates ? "ops/s" : "Count",
1733                     g_gflag ? "genr" : "indv",
1734                     "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1735                 (void) fprintf(out, "---------------------------------"
1736                     "----------------------------------------------\n");
1737         }
1738
1739         displayed = 0;
1740         for (i = 0; i < nrecs; i++) {
1741                 lsp = sort_buf[i];
1742
1743                 if (displayed++ >= g_topn)
1744                         break;
1745
1746                 if (g_pflag) {
1747                         int j;
1748
1749                         (void) fprintf(out, "%u %u",
1750                             lsp->ls_event, lsp->ls_count);
1751                         (void) fprintf(out, " %s",
1752                             format_symbol(buf, lsp->ls_lock, g_cflag));
1753                         (void) fprintf(out, " %s",
1754                             format_symbol(buf, lsp->ls_caller, 0));
1755                         (void) fprintf(out, " %f",
1756                             (double)lsp->ls_refcnt / lsp->ls_count);
1757                         if (rectype >= LS_TIME)
1758                                 (void) fprintf(out, " %llu",
1759                                     (unsigned long long)lsp->ls_time);
1760                         if (rectype >= LS_HIST) {
1761                                 for (j = 0; j < 64; j++)
1762                                         (void) fprintf(out, " %u",
1763                                             lsp->ls_hist[j]);
1764                         }
1765                         for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1766                                 if (rectype <= LS_STACK(j) ||
1767                                     lsp->ls_stack[j] == 0)
1768                                         break;
1769                                 (void) fprintf(out, " %s",
1770                                     format_symbol(buf, lsp->ls_stack[j], 0));
1771                         }
1772                         (void) fprintf(out, "\n");
1773                         continue;
1774                 }
1775
1776                 if (rectype >= LS_HIST) {
1777                         (void) fprintf(out, "---------------------------------"
1778                             "----------------------------------------------\n");
1779                         (void) sprintf(buf, "%s",
1780                             g_event_info[event].ev_units);
1781                         (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1782                             g_rates ? "ops/s" : "Count",
1783                             g_gflag ? "genr" : "indv",
1784                             "cuml", "rcnt", buf, lhdr, chdr);
1785                 }
1786
1787                 if (g_Pflag && total_time != 0)
1788                         percent = (lsp->ls_time * 100.00) / total_time;
1789                 else
1790                         percent = (lsp->ls_count * 100.00) / total_count;
1791
1792                 ptotal += percent;
1793
1794                 if (rectype >= LS_TIME)
1795                         (void) sprintf(buf, "%llu",
1796                             (unsigned long long)(lsp->ls_time / lsp->ls_count));
1797                 else
1798                         buf[0] = '\0';
1799
1800                 (void) fprintf(out, "%5llu ",
1801                     g_rates == 0 ? lsp->ls_count :
1802                     ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1803
1804                 (void) fprintf(out, "%3.0f%% ", percent);
1805
1806                 if (g_gflag)
1807                         (void) fprintf(out, "---- ");
1808                 else
1809                         (void) fprintf(out, "%3.0f%% ", ptotal);
1810
1811                 (void) fprintf(out, "%4.2f %8s ",
1812                     (double)lsp->ls_refcnt / lsp->ls_count, buf);
1813
1814                 (void) fprintf(out, "%-22s ",
1815                     format_symbol(buf, lsp->ls_lock, g_cflag));
1816
1817                 (void) fprintf(out, "%-24s\n",
1818                     format_symbol(buf, lsp->ls_caller, 0));
1819
1820                 if (rectype < LS_HIST)
1821                         continue;
1822
1823                 (void) fprintf(out, "\n");
1824                 (void) fprintf(out, "%10s %31s %-9s %-24s\n",
1825                     g_event_info[event].ev_units,
1826                     "------ Time Distribution ------",
1827                     g_rates ? "ops/s" : "count",
1828                     rectype > LS_STACK(0) ? "Stack" : "");
1829
1830                 first_bin = 0;
1831                 while (lsp->ls_hist[first_bin] == 0)
1832                         first_bin++;
1833
1834                 last_bin = 63;
1835                 while (lsp->ls_hist[last_bin] == 0)
1836                         last_bin--;
1837
1838                 max_bin_count = 0;
1839                 total_bin_count = 0;
1840                 for (j = first_bin; j <= last_bin; j++) {
1841                         total_bin_count += lsp->ls_hist[j];
1842                         if (lsp->ls_hist[j] > max_bin_count)
1843                                 max_bin_count = lsp->ls_hist[j];
1844                 }
1845
1846                 /*
1847                  * If we went a few frames below the caller, ignore them
1848                  */
1849                 for (fr = 3; fr > 0; fr--)
1850                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1851                                 break;
1852
1853                 for (j = first_bin; j <= last_bin; j++) {
1854                         uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1855                         (void) fprintf(out, "%10llu |%s%s %-9u ",
1856                             1ULL << j,
1857                             "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1858                             "                              " + depth,
1859                             g_rates == 0 ? lsp->ls_hist[j] :
1860                             (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1861                             g_elapsed));
1862                         if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1863                                 (void) fprintf(out, "\n");
1864                                 continue;
1865                         }
1866                         (void) fprintf(out, "%-24s\n",
1867                             format_symbol(buf, lsp->ls_stack[fr], 0));
1868                         fr++;
1869                 }
1870                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1871                         (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1872                             format_symbol(buf, lsp->ls_stack[fr], 0));
1873                         fr++;
1874                 }
1875         }
1876
1877         if (!g_pflag)
1878                 (void) fprintf(out, "---------------------------------"
1879                     "----------------------------------------------\n");
1880
1881         (void) fflush(out);
1882 }
1883
1884 static void
1885 report_trace(FILE *out, lsrec_t **sort_buf)
1886 {
1887         lsrec_t *lsp;
1888         int i, fr;
1889         int rectype;
1890         char buf[256], buf2[256];
1891
1892         rectype = g_recsize;
1893
1894         if (!g_pflag) {
1895                 (void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1896                     "Event", "Time", "Owner", "Lock", "Caller");
1897                 (void) fprintf(out, "---------------------------------"
1898                     "----------------------------------------------\n");
1899         }
1900
1901         for (i = 0; i < g_nrecs_used; i++) {
1902
1903                 lsp = sort_buf[i];
1904
1905                 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1906                         continue;
1907
1908                 (void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1909                     lsp->ls_event, (unsigned long long)lsp->ls_time,
1910                     (void *)lsp->ls_next,
1911                     format_symbol(buf, lsp->ls_lock, 0),
1912                     format_symbol(buf2, lsp->ls_caller, 0));
1913
1914                 if (rectype <= LS_STACK(0))
1915                         continue;
1916
1917                 /*
1918                  * If we went a few frames below the caller, ignore them
1919                  */
1920                 for (fr = 3; fr > 0; fr--)
1921                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1922                                 break;
1923
1924                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1925                         (void) fprintf(out, "%53s  %-24s\n", "",
1926                             format_symbol(buf, lsp->ls_stack[fr], 0));
1927                         fr++;
1928                 }
1929                 (void) fprintf(out, "\n");
1930         }
1931
1932         (void) fflush(out);
1933 }