]> CyberLeo.Net >> Repos - FreeBSD/stable/10.git/blob - cddl/contrib/opensolaris/cmd/lockstat/lockstat.c
MFC r277300 (by smh): Mechanically convert cddl sun #ifdef's to illumos
[FreeBSD/stable/10.git] / cddl / contrib / opensolaris / cmd / lockstat / lockstat.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 #pragma ident   "%Z%%M% %I%     %E% SMI"
27
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <stdarg.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <ctype.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <sys/types.h>
40 #include <sys/modctl.h>
41 #include <sys/stat.h>
42 #include <sys/wait.h>
43 #include <dtrace.h>
44 #include <sys/lockstat.h>
45 #include <alloca.h>
46 #include <signal.h>
47 #include <assert.h>
48
/*
 * Platform glue.  Builds without "illumos" defined pull in two extra system
 * headers, supply the pc_t typedef, and redirect mergesort() to this file's
 * lsmergesort() so the name does not clash with a library routine.
 */
#ifndef illumos
#include <sys/time.h>
#include <sys/resource.h>

typedef uintptr_t       pc_t;

#define GETOPT_EOF              (-1)
#define mergesort(a, b, c, d)   lsmergesort(a, b, c, d)
#else
#define GETOPT_EOF      EOF
#endif
60
/* Option string in getopt(3) syntax; a trailing ':' means "takes a value". */
#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"

/* Capacity limits: stack frames kept per record, and event-table slots. */
#define LS_MAX_STACK_DEPTH      50
#define LS_MAX_EVENTS           64
65
66 typedef struct lsrec {
67         struct lsrec    *ls_next;       /* next in hash chain */
68         uintptr_t       ls_lock;        /* lock address */
69         uintptr_t       ls_caller;      /* caller address */
70         uint32_t        ls_count;       /* cumulative event count */
71         uint32_t        ls_event;       /* type of event */
72         uintptr_t       ls_refcnt;      /* cumulative reference count */
73         uint64_t        ls_time;        /* cumulative event duration */
74         uint32_t        ls_hist[64];    /* log2(duration) histogram */
75         uintptr_t       ls_stack[LS_MAX_STACK_DEPTH];
76 } lsrec_t;
77
/* Cursor over a pool of records: next free slot plus a running count. */
typedef struct lsdata {
        struct lsrec    *lsd_next;      /* next record available for use */
        int             lsd_count;      /* how many records there are */
} lsdata_t;
82
/*
 * Experiment types, cheapest first in both memory and processing time.
 * Each macro evaluates to the number of bytes of an lsrec_t that the
 * experiment needs, i.e. the offset of the first member it does NOT use.
 */
#define LS_BASIC        offsetof(lsrec_t, ls_time)      /* no timing data */
#define LS_TIME         offsetof(lsrec_t, ls_hist[0])   /* plus ls_time */
#define LS_HIST         offsetof(lsrec_t, ls_stack[0])  /* plus ls_hist[] */
#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) /* plus frames */
92
93 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
94 static void report_trace(FILE *, lsrec_t **);
95
96 extern int symtab_init(void);
97 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
98 extern uintptr_t sym_to_addr(char *name);
99 extern size_t sym_size(char *name);
100 extern char *strtok_r(char *, const char *, char **);
101
102 #define DEFAULT_NRECS   10000
103 #define DEFAULT_HZ      97
104 #define MAX_HZ          1000
105 #define MIN_AGGSIZE     (16 * 1024)
106 #define MAX_AGGSIZE     (32 * 1024 * 1024)
107
108 static int g_stkdepth;
109 static int g_topn = INT_MAX;
110 static hrtime_t g_elapsed;
111 static int g_rates = 0;
112 static int g_pflag = 0;
113 static int g_Pflag = 0;
114 static int g_wflag = 0;
115 static int g_Wflag = 0;
116 static int g_cflag = 0;
117 static int g_kflag = 0;
118 static int g_gflag = 0;
119 static int g_Vflag = 0;
120 static int g_tracing = 0;
121 static size_t g_recsize;
122 static size_t g_nrecs;
123 static int g_nrecs_used;
124 static uchar_t g_enabled[LS_MAX_EVENTS];
125 static hrtime_t g_min_duration[LS_MAX_EVENTS];
126 static dtrace_hdl_t *g_dtp;
127 static char *g_predicate;
128 static char *g_ipredicate;
129 static char *g_prog;
130 static int g_proglen;
131 static int g_dropped;
132
133 typedef struct ls_event_info {
134         char    ev_type;
135         char    ev_lhdr[20];
136         char    ev_desc[80];
137         char    ev_units[10];
138         char    ev_name[DTRACE_NAMELEN];
139         char    *ev_predicate;
140         char    *ev_acquire;
141 } ls_event_info_t;
142
143 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
144         { 'C',  "Lock", "Adaptive mutex spin",                  "nsec",
145             "lockstat:::adaptive-spin" },
146         { 'C',  "Lock", "Adaptive mutex block",                 "nsec",
147             "lockstat:::adaptive-block" },
148         { 'C',  "Lock", "Spin lock spin",                       "nsec",
149             "lockstat:::spin-spin" },
150         { 'C',  "Lock", "Thread lock spin",                     "nsec",
151             "lockstat:::thread-spin" },
152         { 'C',  "Lock", "R/W writer blocked by writer",         "nsec",
153             "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
154         { 'C',  "Lock", "R/W writer blocked by readers",        "nsec",
155             "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
156         { 'C',  "Lock", "R/W reader blocked by writer",         "nsec",
157             "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
158         { 'C',  "Lock", "R/W reader blocked by write wanted",   "nsec",
159             "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
160         { 'C',  "Lock", "R/W writer spin on writer",            "nsec",
161             "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" },
162         { 'C',  "Lock", "R/W writer spin on readers",           "nsec",
163             "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" },
164         { 'C',  "Lock", "R/W reader spin on writer",            "nsec",
165             "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" },
166         { 'C',  "Lock", "R/W reader spin on write wanted",      "nsec",
167             "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" },
168         { 'C',  "Lock", "SX exclusive block",                   "nsec",
169             "lockstat:::sx-block", "arg2 == 0" },
170         { 'C',  "Lock", "SX shared block",                      "nsec",
171             "lockstat:::sx-block", "arg2 != 0" },
172         { 'C',  "Lock", "SX exclusive spin",                    "nsec",
173             "lockstat:::sx-spin", "arg2 == 0" },
174         { 'C',  "Lock", "SX shared spin",                       "nsec",
175             "lockstat:::sx-spin", "arg2 != 0" },
176         { 'C',  "Lock", "Unknown event (type 16)",              "units" },
177         { 'C',  "Lock", "Unknown event (type 17)",              "units" },
178         { 'C',  "Lock", "Unknown event (type 18)",              "units" },
179         { 'C',  "Lock", "Unknown event (type 19)",              "units" },
180         { 'C',  "Lock", "Unknown event (type 20)",              "units" },
181         { 'C',  "Lock", "Unknown event (type 21)",              "units" },
182         { 'C',  "Lock", "Unknown event (type 22)",              "units" },
183         { 'C',  "Lock", "Unknown event (type 23)",              "units" },
184         { 'C',  "Lock", "Unknown event (type 24)",              "units" },
185         { 'C',  "Lock", "Unknown event (type 25)",              "units" },
186         { 'C',  "Lock", "Unknown event (type 26)",              "units" },
187         { 'C',  "Lock", "Unknown event (type 27)",              "units" },
188         { 'C',  "Lock", "Unknown event (type 28)",              "units" },
189         { 'C',  "Lock", "Unknown event (type 29)",              "units" },
190         { 'C',  "Lock", "Unknown event (type 30)",              "units" },
191         { 'C',  "Lock", "Unknown event (type 31)",              "units" },
192         { 'H',  "Lock", "Adaptive mutex hold",                  "nsec",
193             "lockstat:::adaptive-release", NULL,
194             "lockstat:::adaptive-acquire" },
195         { 'H',  "Lock", "Spin lock hold",                       "nsec",
196             "lockstat:::spin-release", NULL,
197             "lockstat:::spin-acquire" },
198         { 'H',  "Lock", "R/W writer hold",                      "nsec",
199             "lockstat::rw_wunlock:rw-release", NULL,
200             "lockstat::rw_wlock:rw-acquire" },
201         { 'H',  "Lock", "R/W reader hold",                      "nsec",
202             "lockstat::rw_runlock:rw-release", NULL,
203             "lockstat::rw_rlock:rw-acquire" },
204         { 'H',  "Lock", "SX shared hold",                       "nsec",
205             "lockstat::sx_sunlock:sx-release", NULL,
206             "lockstat::sx_slock:sx-acquire" },
207         { 'H',  "Lock", "SX exclusive hold",                    "nsec",
208             "lockstat::sx_xunlock:sx-release", NULL,
209             "lockstat::sx_xlock:sx-acquire" },
210         { 'H',  "Lock", "Unknown event (type 38)",              "units" },
211         { 'H',  "Lock", "Unknown event (type 39)",              "units" },
212         { 'H',  "Lock", "Unknown event (type 40)",              "units" },
213         { 'H',  "Lock", "Unknown event (type 41)",              "units" },
214         { 'H',  "Lock", "Unknown event (type 42)",              "units" },
215         { 'H',  "Lock", "Unknown event (type 43)",              "units" },
216         { 'H',  "Lock", "Unknown event (type 44)",              "units" },
217         { 'H',  "Lock", "Unknown event (type 45)",              "units" },
218         { 'H',  "Lock", "Unknown event (type 46)",              "units" },
219         { 'H',  "Lock", "Unknown event (type 47)",              "units" },
220         { 'H',  "Lock", "Unknown event (type 48)",              "units" },
221         { 'H',  "Lock", "Unknown event (type 49)",              "units" },
222         { 'H',  "Lock", "Unknown event (type 50)",              "units" },
223         { 'H',  "Lock", "Unknown event (type 51)",              "units" },
224         { 'H',  "Lock", "Unknown event (type 52)",              "units" },
225         { 'H',  "Lock", "Unknown event (type 53)",              "units" },
226         { 'H',  "Lock", "Unknown event (type 54)",              "units" },
227         { 'H',  "Lock", "Unknown event (type 55)",              "units" },
228 #ifdef illumos
229         { 'I',  "CPU+PIL", "Profiling interrupt",               "nsec",
230 #else
231         { 'I',  "CPU+Pri_Class", "Profiling interrupt",         "nsec",
232 #endif
233             "profile:::profile-97", NULL },
234         { 'I',  "Lock", "Unknown event (type 57)",              "units" },
235         { 'I',  "Lock", "Unknown event (type 58)",              "units" },
236         { 'I',  "Lock", "Unknown event (type 59)",              "units" },
237         { 'E',  "Lock", "Recursive lock entry detected",        "(N/A)",
238             "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
239         { 'E',  "Lock", "Lockstat enter failure",               "(N/A)" },
240         { 'E',  "Lock", "Lockstat exit failure",                "nsec"  },
241         { 'E',  "Lock", "Lockstat record failure",              "(N/A)" },
242 };
243
#ifndef illumos
/*
 * Short labels for priority classes (non-illumos builds only).  Entry 0 is
 * the empty string.  NOTE(review): presumably indexed by the td_pri_class
 * value encoded by dprog_addevent(); confirm against the report code.
 */
static char *g_pri_class[] = {
        "",             /* 0 */
        "Intr",         /* 1 */
        "RealT",        /* 2 */
        "TShar",        /* 3 */
        "Idle"          /* 4 */
};
#endif
253
/*
 * Print "lockstat: <formatted message>" to stderr, optionally followed by
 * ": <strerror(errno)>", then a newline, and exit with status 2.
 *
 * do_perror    non-zero to append the text for the errno value that was
 *              current when fail() was entered
 * message      printf-style format, followed by its arguments
 *
 * Never returns.
 */
static void
fail(int do_perror, const char *message, ...)
{
        /* Capture errno first: the stdio calls below may overwrite it. */
        const int saved_errno = errno;
        va_list ap;

        (void) fputs("lockstat: ", stderr);

        va_start(ap, message);
        (void) vfprintf(stderr, message, ap);
        va_end(ap);

        if (do_perror != 0)
                (void) fprintf(stderr, ": %s", strerror(saved_errno));

        (void) fputc('\n', stderr);
        exit(2);
}
269
270 static void
271 dfail(const char *message, ...)
272 {
273         va_list args;
274
275         va_start(args, message);
276         (void) fprintf(stderr, "lockstat: ");
277         (void) vfprintf(stderr, message, args);
278         va_end(args);
279         (void) fprintf(stderr, ": %s\n",
280             dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
281
282         exit(2);
283 }
284
285 static void
286 show_events(char event_type, char *desc)
287 {
288         int i, first = -1, last;
289
290         for (i = 0; i < LS_MAX_EVENTS; i++) {
291                 ls_event_info_t *evp = &g_event_info[i];
292                 if (evp->ev_type != event_type ||
293                     strncmp(evp->ev_desc, "Unknown event", 13) == 0)
294                         continue;
295                 if (first == -1)
296                         first = i;
297                 last = i;
298         }
299
300         (void) fprintf(stderr,
301             "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
302             desc, event_type, first, last);
303
304         for (i = first; i <= last; i++)
305                 (void) fprintf(stderr,
306                     "%4d = %s\n", i, g_event_info[i].ev_desc);
307 }
308
309 static void
310 usage(void)
311 {
312         (void) fprintf(stderr,
313             "Usage: lockstat [options] command [args]\n"
314             "\nEvent selection options:\n\n"
315             "  -C              watch contention events [on by default]\n"
316             "  -E              watch error events [off by default]\n"
317             "  -H              watch hold events [off by default]\n"
318             "  -I              watch interrupt events [off by default]\n"
319             "  -A              watch all lock events [equivalent to -CH]\n"
320             "  -e event_list   only watch the specified events (shown below);\n"
321             "                  <event_list> is a comma-separated list of\n"
322             "                  events or ranges of events, e.g. 1,4-7,35\n"
323             "  -i rate         interrupt rate for -I [default: %d Hz]\n"
324             "\nData gathering options:\n\n"
325             "  -b              basic statistics (lock, caller, event count)\n"
326             "  -t              timing for all events [default]\n"
327             "  -h              histograms for event times\n"
328             "  -s depth        stack traces <depth> deep\n"
329             "  -x opt[=val]    enable or modify DTrace options\n"
330             "\nData filtering options:\n\n"
331             "  -n nrecords     maximum number of data records [default: %d]\n"
332             "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
333             "                  symbolic name or hex address; <size> defaults\n"
334             "                  to the ELF symbol size if available, 1 if not\n"
335             "  -f func[,size]  only watch events generated by <func>\n"
336             "  -d duration     only watch events longer than <duration>\n"
337             "  -T              trace (rather than sample) events\n"
338             "\nData reporting options:\n\n"
339             "  -c              coalesce lock data for arrays like pse_mutex[]\n"
340             "  -k              coalesce PCs within functions\n"
341             "  -g              show total events generated by function\n"
342             "  -w              wherever: don't distinguish events by caller\n"
343             "  -W              whichever: don't distinguish events by lock\n"
344             "  -R              display rates rather than counts\n"
345             "  -p              parsable output format (awk(1)-friendly)\n"
346             "  -P              sort lock data by (count * avg_time) product\n"
347             "  -D n            only display top <n> events of each type\n"
348             "  -o filename     send output to <filename>\n",
349             DEFAULT_HZ, DEFAULT_NRECS);
350
351         show_events('C', "Contention");
352         show_events('H', "Hold-time");
353         show_events('I', "Interrupt");
354         show_events('E', "Error");
355         (void) fprintf(stderr, "\n");
356
357         exit(1);
358 }
359
360 static int
361 lockcmp(lsrec_t *a, lsrec_t *b)
362 {
363         int i;
364
365         if (a->ls_event < b->ls_event)
366                 return (-1);
367         if (a->ls_event > b->ls_event)
368                 return (1);
369
370         for (i = g_stkdepth - 1; i >= 0; i--) {
371                 if (a->ls_stack[i] < b->ls_stack[i])
372                         return (-1);
373                 if (a->ls_stack[i] > b->ls_stack[i])
374                         return (1);
375         }
376
377         if (a->ls_caller < b->ls_caller)
378                 return (-1);
379         if (a->ls_caller > b->ls_caller)
380                 return (1);
381
382         if (a->ls_lock < b->ls_lock)
383                 return (-1);
384         if (a->ls_lock > b->ls_lock)
385                 return (1);
386
387         return (0);
388 }
389
390 static int
391 countcmp(lsrec_t *a, lsrec_t *b)
392 {
393         if (a->ls_event < b->ls_event)
394                 return (-1);
395         if (a->ls_event > b->ls_event)
396                 return (1);
397
398         return (b->ls_count - a->ls_count);
399 }
400
401 static int
402 timecmp(lsrec_t *a, lsrec_t *b)
403 {
404         if (a->ls_event < b->ls_event)
405                 return (-1);
406         if (a->ls_event > b->ls_event)
407                 return (1);
408
409         if (a->ls_time < b->ls_time)
410                 return (1);
411         if (a->ls_time > b->ls_time)
412                 return (-1);
413
414         return (0);
415 }
416
417 static int
418 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
419 {
420         if (a->ls_event < b->ls_event)
421                 return (-1);
422         if (a->ls_event > b->ls_event)
423                 return (1);
424
425         if (a->ls_lock < b->ls_lock)
426                 return (-1);
427         if (a->ls_lock > b->ls_lock)
428                 return (1);
429
430         return (0);
431 }
432
433 static int
434 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
435 {
436         if (a->ls_event < b->ls_event)
437                 return (-1);
438         if (a->ls_event > b->ls_event)
439                 return (1);
440
441         if (a->ls_lock < b->ls_lock)
442                 return (-1);
443         if (a->ls_lock > b->ls_lock)
444                 return (1);
445
446         return (b->ls_count - a->ls_count);
447 }
448
449 static int
450 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
451 {
452         int i;
453
454         if (a->ls_event < b->ls_event)
455                 return (-1);
456         if (a->ls_event > b->ls_event)
457                 return (1);
458
459         for (i = g_stkdepth - 1; i >= 0; i--) {
460                 if (a->ls_stack[i] < b->ls_stack[i])
461                         return (-1);
462                 if (a->ls_stack[i] > b->ls_stack[i])
463                         return (1);
464         }
465
466         if (a->ls_caller < b->ls_caller)
467                 return (-1);
468         if (a->ls_caller > b->ls_caller)
469                 return (1);
470
471         return (0);
472 }
473
474 static int
475 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
476 {
477         int i;
478
479         if (a->ls_event < b->ls_event)
480                 return (-1);
481         if (a->ls_event > b->ls_event)
482                 return (1);
483
484         for (i = g_stkdepth - 1; i >= 0; i--) {
485                 if (a->ls_stack[i] < b->ls_stack[i])
486                         return (-1);
487                 if (a->ls_stack[i] > b->ls_stack[i])
488                         return (1);
489         }
490
491         if (a->ls_caller < b->ls_caller)
492                 return (-1);
493         if (a->ls_caller > b->ls_caller)
494                 return (1);
495
496         return (b->ls_count - a->ls_count);
497 }
498
499 static void
500 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
501 {
502         int m = n / 2;
503         int i, j;
504
505         if (m > 1)
506                 lsmergesort(cmp, a, b, m);
507         if (n - m > 1)
508                 lsmergesort(cmp, a + m, b + m, n - m);
509         for (i = m; i > 0; i--)
510                 b[i - 1] = a[i - 1];
511         for (j = m - 1; j < n - 1; j++)
512                 b[n + m - j - 2] = a[j + 1];
513         while (i < j)
514                 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
515         *a = b[i];
516 }
517
518 static void
519 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
520 {
521         int i, j;
522         lsrec_t *target, *current;
523
524         target = lock[0];
525
526         for (i = 1; i < n; i++) {
527                 current = lock[i];
528                 if (cmp(current, target) != 0) {
529                         target = current;
530                         continue;
531                 }
532                 current->ls_event = LS_MAX_EVENTS;
533                 target->ls_count += current->ls_count;
534                 target->ls_refcnt += current->ls_refcnt;
535                 if (g_recsize < LS_TIME)
536                         continue;
537                 target->ls_time += current->ls_time;
538                 if (g_recsize < LS_HIST)
539                         continue;
540                 for (j = 0; j < 64; j++)
541                         target->ls_hist[j] += current->ls_hist[j];
542         }
543 }
544
/*
 * If *addrp resolves to a symbol and lies within that symbol's size, move
 * *addrp back to the start of the symbol; otherwise leave it unchanged.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
        uintptr_t offset;
        size_t size;

        if (addr_to_sym(*addrp, &offset, &size) == NULL)
                return;

        if (offset >= size)
                return;

        *addrp -= offset;
}
554
/*
 * Format the predicate text that predicate_add() will install.  Follows
 * snprintf() conventions: writes at most buflen bytes to buf and returns
 * the length the complete string needs, or a negative value on error.
 */
static int
predicate_format(char *buf, size_t buflen, const char *old, const char *what,
    const char *cmp, uintptr_t value)
{
        /*
         * Print the value as "0x" plus hex digits explicitly.  The output
         * of %p is implementation-defined: where it already includes a
         * "0x" prefix, the former "0x%p" produced "0x0x...".
         */
        unsigned long v = (unsigned long)value;

        if (old[0] != '\0') {
                if (cmp != NULL) {
                        return (snprintf(buf, buflen, "(%s) && (%s %s 0x%lx)",
                            old, what, cmp, v));
                }
                return (snprintf(buf, buflen, "(%s) && (%s)", old, what));
        }

        if (cmp != NULL)
                return (snprintf(buf, buflen, "%s %s 0x%lx", what, cmp, v));

        return (snprintf(buf, buflen, "%s", what));
}

/*
 * AND a new term onto the D predicate held in *pred.
 *
 * pred   in/out: heap string or NULL; replaced by a newly allocated string
 *        (the old one is freed)
 * what   left-hand side, or the whole term when cmp is NULL; if what is
 *        NULL the call is a no-op and *pred is untouched
 * cmp    comparison operator text, or NULL for a bare term
 * value  right-hand side, used only when cmp is non-NULL
 *
 * The result is "what [cmp 0xVALUE]" when *pred was NULL or empty, and
 * "(old) && (what [cmp 0xVALUE])" otherwise.  The buffer is sized from the
 * formatted length rather than an estimate, so a long cmp cannot overflow
 * it.  On allocation or formatting failure a message is written to stderr
 * and the process exits with status 2, as fail() does elsewhere in this file.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
        const char *old;
        char *joined;
        char probe;
        int len;

        if (what == NULL)
                return;

        old = (*pred != NULL) ? *pred : "";

        /* First pass measures, using the same 1-byte idiom as dprog_add(). */
        len = predicate_format(&probe, 1, old, what, cmp, value);

        if (len < 0 || (joined = malloc((size_t)len + 1)) == NULL) {
                (void) fprintf(stderr,
                    "lockstat: failed to allocate predicate: %s\n",
                    strerror(errno));
                exit(2);
        }

        (void) predicate_format(joined, (size_t)len + 1, old, what, cmp,
            value);

        free(*pred);
        *pred = joined;
}
592
/*
 * Release the predicate string held in *pred and reset *pred to NULL.
 * A NULL *pred is accepted.
 */
static void
predicate_destroy(char **pred)
{
        char *old = *pred;

        *pred = NULL;
        free(old);
}
599
/*
 * Format the filter text that filter_add() will install.  Follows
 * snprintf() conventions: writes at most buflen bytes to buf and returns
 * the length the complete string needs, or a negative value on error.
 */
static int
filter_format(char *buf, size_t buflen, const char *old, const char *what,
    uintptr_t base, uintptr_t size)
{
        /*
         * Addresses are printed as "0x" plus hex digits explicitly, which
         * yields the same text on every platform without the #ifdef that
         * the implementation-defined %p output used to require.
         */
        return (snprintf(buf, buflen, "%s%s(%s >= 0x%lx && %s < 0x%lx)",
            old, old[0] != '\0' ? " || " : "",
            what, (unsigned long)base,
            what, (unsigned long)(base + size)));
}

/*
 * OR a new address-range term onto the D filter held in *filt.
 *
 * filt  in/out: heap string or NULL; replaced by a newly allocated string
 *       (the old one is freed)
 * what  name of the D expression being range-checked
 * base  first address in the range
 * size  length of the range; the upper bound base + size is exclusive
 *
 * The appended term is "(what >= 0xBASE && what < 0xEND)", preceded by
 * " || " when the filter already had content.  The result is sized from
 * the formatted length, replacing an unbounded sprintf() into a 256-byte
 * stack buffer.  On allocation or formatting failure a message is written
 * to stderr and the process exits with status 2, as fail() does elsewhere
 * in this file.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
        const char *old = (*filt != NULL) ? *filt : "";
        char *joined;
        char probe;
        int len;

        /* First pass measures, using the same 1-byte idiom as dprog_add(). */
        len = filter_format(&probe, 1, old, what, base, size);

        if (len < 0 || (joined = malloc((size_t)len + 1)) == NULL) {
                (void) fprintf(stderr,
                    "lockstat: failed to allocate filter: %s\n",
                    strerror(errno));
                exit(2);
        }

        (void) filter_format(joined, (size_t)len + 1, old, what, base, size);

        free(*filt);
        *filt = joined;
}
626
/*
 * Release the filter string held in *filt and reset *filt to NULL.
 * A NULL *filt is accepted.
 */
static void
filter_destroy(char **filt)
{
        char *old = *filt;

        *filt = NULL;
        free(old);
}
633
634 static void
635 dprog_add(const char *fmt, ...)
636 {
637         va_list args;
638         int size, offs;
639         char c;
640
641         va_start(args, fmt);
642         size = vsnprintf(&c, 1, fmt, args) + 1;
643         va_end(args);
644
645         if (g_proglen == 0) {
646                 offs = 0;
647         } else {
648                 offs = g_proglen - 1;
649         }
650
651         g_proglen = offs + size;
652
653         if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
654                 fail(1, "failed to reallocate program text");
655
656         va_start(args, fmt);
657         (void) vsnprintf(&g_prog[offs], size, fmt, args);
658         va_end(args);
659 }
660
661 /*
662  * This function may read like an open sewer, but keep in mind that programs
663  * that generate other programs are rarely pretty.  If one has the unenviable
664  * task of maintaining or -- worse -- extending this code, use the -V option
665  * to examine the D program as generated by this function.
666  */
667 static void
668 dprog_addevent(int event)
669 {
670         ls_event_info_t *info = &g_event_info[event];
671         char *pred = NULL;
672         char stack[20];
673         const char *arg0, *caller;
674         char *arg1 = "arg1";
675         char buf[80];
676         hrtime_t dur;
677         int depth;
678
679         if (info->ev_name[0] == '\0')
680                 return;
681
682         if (info->ev_type == 'I') {
683                 /*
684                  * For interrupt events, arg0 (normally the lock pointer) is
685                  * the CPU address plus the current pil, and arg1 (normally
686                  * the number of nanoseconds) is the number of nanoseconds
687                  * late -- and it's stored in arg2.
688                  */
689 #ifdef illumos
690                 arg0 = "(uintptr_t)curthread->t_cpu + \n"
691                     "\t    curthread->t_cpu->cpu_profile_pil";
692 #else
693                 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
694                     "\t    0x01000000 + curthread->td_pri_class";
695 #endif
696                 caller = "(uintptr_t)arg0";
697                 arg1 = "arg2";
698         } else {
699                 arg0 = "(uintptr_t)arg0";
700                 caller = "caller";
701         }
702
703         if (g_recsize > LS_HIST) {
704                 for (depth = 0; g_recsize > LS_STACK(depth); depth++)
705                         continue;
706
707                 if (g_tracing) {
708                         (void) sprintf(stack, "\tstack(%d);\n", depth);
709                 } else {
710                         (void) sprintf(stack, ", stack(%d)", depth);
711                 }
712         } else {
713                 (void) sprintf(stack, "");
714         }
715
716         if (info->ev_acquire != NULL) {
717                 /*
718                  * If this is a hold event, we need to generate an additional
719                  * clause for the acquire; the clause for the release will be
720                  * generated with the aggregating statement, below.
721                  */
722                 dprog_add("%s\n", info->ev_acquire);
723                 predicate_add(&pred, info->ev_predicate, NULL, 0);
724                 predicate_add(&pred, g_predicate, NULL, 0);
725                 if (pred != NULL)
726                         dprog_add("/%s/\n", pred);
727
728                 dprog_add("{\n");
729                 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
730
731                 if (info->ev_type == 'H') {
732                         dprog_add("\t%s = timestamp;\n", buf);
733                 } else {
734                         /*
735                          * If this isn't a hold event, it's the recursive
736                          * error event.  For this, we simply bump the
737                          * thread-local, per-lock count.
738                          */
739                         dprog_add("\t%s++;\n", buf);
740                 }
741
742                 dprog_add("}\n\n");
743                 predicate_destroy(&pred);
744                 pred = NULL;
745
746                 if (info->ev_type == 'E') {
747                         /*
748                          * If this is the recursive lock error event, we need
749                          * to generate an additional clause to decrement the
750                          * thread-local, per-lock count.  This assures that we
751                          * only execute the aggregating clause if we have
752                          * recursive entry.
753                          */
754                         dprog_add("%s\n", info->ev_name);
755                         dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
756                 }
757
758                 predicate_add(&pred, buf, NULL, 0);
759
760                 if (info->ev_type == 'H') {
761                         (void) sprintf(buf, "timestamp -\n\t    "
762                             "self->ev%d[(uintptr_t)arg0]", event);
763                 }
764
765                 arg1 = buf;
766         } else {
767                 predicate_add(&pred, info->ev_predicate, NULL, 0);
768                 if (info->ev_type != 'I')
769                         predicate_add(&pred, g_predicate, NULL, 0);
770                 else
771                         predicate_add(&pred, g_ipredicate, NULL, 0);
772         }
773
774         if ((dur = g_min_duration[event]) != 0)
775                 predicate_add(&pred, arg1, ">=", dur);
776
777         dprog_add("%s\n", info->ev_name);
778
779         if (pred != NULL)
780                 dprog_add("/%s/\n", pred);
781         predicate_destroy(&pred);
782
783         dprog_add("{\n");
784
785         if (g_tracing) {
786                 dprog_add("\ttrace(%dULL);\n", event);
787                 dprog_add("\ttrace(%s);\n", arg0);
788                 dprog_add("\ttrace(%s);\n", caller);
789                 dprog_add(stack);
790         } else {
791                 /*
792                  * The ordering here is important:  when we process the
793                  * aggregate, we count on the fact that @avg appears before
794                  * @hist in program order to assure that @avg is assigned the
795                  * first aggregation variable ID and @hist assigned the
796                  * second; see the comment in process_aggregate() for details.
797                  */
798                 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
799                     event, arg0, caller, stack, arg1);
800
801                 if (g_recsize >= LS_HIST) {
802                         dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
803                             "(%s);\n", event, arg0, caller, stack, arg1);
804                 }
805         }
806
807         if (info->ev_acquire != NULL)
808                 dprog_add("\tself->ev%d[arg0] = 0;\n", event);
809
810         dprog_add("}\n\n");
811 }
812
813 static void
814 dprog_compile()
815 {
816         dtrace_prog_t *prog;
817         dtrace_proginfo_t info;
818
819         if (g_Vflag) {
820                 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
821                 (void) fputs(g_prog, stderr);
822                 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
823         }
824
825         if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
826             DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
827                 dfail("failed to compile program");
828
829         if (dtrace_program_exec(g_dtp, prog, &info) == -1)
830                 dfail("failed to enable probes");
831
832         if (dtrace_go(g_dtp) != 0)
833                 dfail("couldn't start tracing");
834 }
835
/*
 * SIGUSR1 handler installed by status_init().  It deliberately does nothing:
 * the handler is installed without SA_RESTART, so delivery of the signal is
 * enough to make the waitpid() loop in main() return and call status_check().
 */
#ifdef illumos
static void
status_fire(void)
{
}
#else
static void
status_fire(int i)
{
	/* The signal number is of no interest here. */
	(void) i;
}
#endif
843
844 static void
845 status_init(void)
846 {
847         dtrace_optval_t val, status, agg;
848         struct sigaction act;
849         struct itimerspec ts;
850         struct sigevent ev;
851         timer_t tid;
852
853         if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
854                 dfail("failed to get 'statusrate'");
855
856         if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
857                 dfail("failed to get 'statusrate'");
858
859         /*
860          * We would want to awaken at a rate that is the GCD of the statusrate
861          * and the aggrate -- but that seems a bit absurd.  Instead, we'll
862          * simply awaken at a rate that is the more frequent of the two, which
863          * assures that we're never later than the interval implied by the
864          * more frequent rate.
865          */
866         val = status < agg ? status : agg;
867
868         (void) sigemptyset(&act.sa_mask);
869         act.sa_flags = 0;
870         act.sa_handler = status_fire;
871         (void) sigaction(SIGUSR1, &act, NULL);
872
873         ev.sigev_notify = SIGEV_SIGNAL;
874         ev.sigev_signo = SIGUSR1;
875
876         if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
877                 dfail("cannot create CLOCK_REALTIME timer");
878
879         ts.it_value.tv_sec = val / NANOSEC;
880         ts.it_value.tv_nsec = val % NANOSEC;
881         ts.it_interval = ts.it_value;
882
883         if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
884                 dfail("cannot set time on CLOCK_REALTIME timer");
885 }
886
887 static void
888 status_check(void)
889 {
890         if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
891                 dfail("failed to snap aggregate");
892
893         if (dtrace_status(g_dtp) == -1)
894                 dfail("dtrace_status()");
895 }
896
897 static void
898 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
899 {
900         bzero(lsrec, g_recsize);
901         lsrec->ls_count = 1;
902
903         if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
904                 fail(0, "truncated DTrace record");
905
906         if (rec->dtrd_size != sizeof (uint64_t))
907                 fail(0, "bad event size in first record");
908
909         /* LINTED - alignment */
910         lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
911         rec++;
912
913         if (rec->dtrd_size != sizeof (uintptr_t))
914                 fail(0, "bad lock address size in second record");
915
916         /* LINTED - alignment */
917         lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
918         rec++;
919
920         if (rec->dtrd_size != sizeof (uintptr_t))
921                 fail(0, "bad caller size in third record");
922
923         /* LINTED - alignment */
924         lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
925         rec++;
926
927         if (g_recsize > LS_HIST) {
928                 int frames, i;
929                 pc_t *stack;
930
931                 frames = rec->dtrd_size / sizeof (pc_t);
932                 /* LINTED - alignment */
933                 stack = (pc_t *)(data + rec->dtrd_offset);
934
935                 for (i = 1; i < frames; i++)
936                         lsrec->ls_stack[i - 1] = stack[i];
937         }
938 }
939
940 /*ARGSUSED*/
941 static int
942 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
943 {
944         *((size_t *)arg) += 1;
945
946         return (DTRACE_AGGWALK_NEXT);
947 }
948
949 static int
950 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
951 {
952         const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
953         caddr_t data = agg->dtada_data;
954         lsdata_t *lsdata = arg;
955         lsrec_t *lsrec = lsdata->lsd_next;
956         const dtrace_recdesc_t *rec;
957         uint64_t *avg, *quantized;
958         int i, j;
959
960         assert(lsdata->lsd_count < g_nrecs);
961
962         /*
963          * Aggregation variable IDs are guaranteed to be generated in program
964          * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
965          * plus one.  As "avg" appears before "hist" in program order, we know
966          * that "avg" will be allocated the first aggregation variable ID, and
967          * "hist" will be allocated the second aggregation variable ID -- and
968          * we therefore use the aggregation variable ID to differentiate the
969          * cases.
970          */
971         if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
972                 /*
973                  * If this is the histogram entry.  We'll copy the quantized
974                  * data into lc_hist, and jump over the rest.
975                  */
976                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
977
978                 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
979                         fail(0, "bad variable ID in aggregation record");
980
981                 if (rec->dtrd_size !=
982                     DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
983                         fail(0, "bad quantize size in aggregation record");
984
985                 /* LINTED - alignment */
986                 quantized = (uint64_t *)(data + rec->dtrd_offset);
987
988                 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
989                     i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
990                         lsrec->ls_hist[j] = quantized[i];
991
992                 goto out;
993         }
994
995         lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
996             aggdesc->dtagd_nrecs - 1, data);
997
998         rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
999
1000         if (rec->dtrd_size != 2 * sizeof (uint64_t))
1001                 fail(0, "bad avg size in aggregation record");
1002
1003         /* LINTED - alignment */
1004         avg = (uint64_t *)(data + rec->dtrd_offset);
1005         lsrec->ls_count = (uint32_t)avg[0];
1006         lsrec->ls_time = (uintptr_t)avg[1];
1007
1008         if (g_recsize >= LS_HIST)
1009                 return (DTRACE_AGGWALK_NEXT);
1010
1011 out:
1012         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1013         lsdata->lsd_count++;
1014
1015         return (DTRACE_AGGWALK_NEXT);
1016 }
1017
1018 static int
1019 process_trace(const dtrace_probedata_t *pdata, void *arg)
1020 {
1021         lsdata_t *lsdata = arg;
1022         lsrec_t *lsrec = lsdata->lsd_next;
1023         dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1024         caddr_t data = pdata->dtpda_data;
1025
1026         if (lsdata->lsd_count >= g_nrecs)
1027                 return (DTRACE_CONSUME_NEXT);
1028
1029         lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1030
1031         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1032         lsdata->lsd_count++;
1033
1034         return (DTRACE_CONSUME_NEXT);
1035 }
1036
1037 static int
1038 process_data(FILE *out, char *data)
1039 {
1040         lsdata_t lsdata;
1041
1042         /* LINTED - alignment */
1043         lsdata.lsd_next = (lsrec_t *)data;
1044         lsdata.lsd_count = 0;
1045
1046         if (g_tracing) {
1047                 if (dtrace_consume(g_dtp, out,
1048                     process_trace, NULL, &lsdata) != 0)
1049                         dfail("failed to consume buffer");
1050
1051                 return (lsdata.lsd_count);
1052         }
1053
1054         if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1055             process_aggregate, &lsdata) != 0)
1056                 dfail("failed to walk aggregate");
1057
1058         return (lsdata.lsd_count);
1059 }
1060
1061 /*ARGSUSED*/
1062 static int
1063 drophandler(const dtrace_dropdata_t *data, void *arg)
1064 {
1065         g_dropped++;
1066         (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1067         return (DTRACE_HANDLE_OK);
1068 }
1069
1070 int
1071 main(int argc, char **argv)
1072 {
1073         char *data_buf;
1074         lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1075         FILE *out = stdout;
1076         int c;
1077         pid_t child;
1078         int status;
1079         int i, j;
1080         hrtime_t duration;
1081         char *addrp, *offp, *sizep, *evp, *lastp, *p;
1082         uintptr_t addr;
1083         size_t size, off;
1084         int events_specified = 0;
1085         int exec_errno = 0;
1086         uint32_t event;
1087         char *filt = NULL, *ifilt = NULL;
1088         static uint64_t ev_count[LS_MAX_EVENTS + 1];
1089         static uint64_t ev_time[LS_MAX_EVENTS + 1];
1090         dtrace_optval_t aggsize;
1091         char aggstr[10];
1092         long ncpus;
1093         int dynvar = 0;
1094         int err;
1095
1096         if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1097                 fail(0, "cannot open dtrace library: %s",
1098                     dtrace_errmsg(NULL, err));
1099         }
1100
1101         if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1102                 dfail("couldn't establish drop handler");
1103
1104         if (symtab_init() == -1)
1105                 fail(1, "can't load kernel symbols");
1106
1107         g_nrecs = DEFAULT_NRECS;
1108
1109         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1110                 switch (c) {
1111                 case 'b':
1112                         g_recsize = LS_BASIC;
1113                         break;
1114
1115                 case 't':
1116                         g_recsize = LS_TIME;
1117                         break;
1118
1119                 case 'h':
1120                         g_recsize = LS_HIST;
1121                         break;
1122
1123                 case 's':
1124                         if (!isdigit(optarg[0]))
1125                                 usage();
1126                         g_stkdepth = atoi(optarg);
1127                         if (g_stkdepth > LS_MAX_STACK_DEPTH)
1128                                 fail(0, "max stack depth is %d",
1129                                     LS_MAX_STACK_DEPTH);
1130                         g_recsize = LS_STACK(g_stkdepth);
1131                         break;
1132
1133                 case 'n':
1134                         if (!isdigit(optarg[0]))
1135                                 usage();
1136                         g_nrecs = atoi(optarg);
1137                         break;
1138
1139                 case 'd':
1140                         if (!isdigit(optarg[0]))
1141                                 usage();
1142                         duration = atoll(optarg);
1143
1144                         /*
1145                          * XXX -- durations really should be per event
1146                          * since the units are different, but it's hard
1147                          * to express this nicely in the interface.
1148                          * Not clear yet what the cleanest solution is.
1149                          */
1150                         for (i = 0; i < LS_MAX_EVENTS; i++)
1151                                 if (g_event_info[i].ev_type != 'E')
1152                                         g_min_duration[i] = duration;
1153
1154                         break;
1155
1156                 case 'i':
1157                         if (!isdigit(optarg[0]))
1158                                 usage();
1159                         i = atoi(optarg);
1160                         if (i <= 0)
1161                                 usage();
1162                         if (i > MAX_HZ)
1163                                 fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1164
1165                         for (j = 0; j < LS_MAX_EVENTS; j++)
1166                                 if (strcmp(g_event_info[j].ev_desc,
1167                                     "Profiling interrupt") == 0)
1168                                         break;
1169
1170                         (void) sprintf(g_event_info[j].ev_name,
1171                             "profile:::profile-%d", i);
1172                         break;
1173
1174                 case 'l':
1175                 case 'f':
1176                         addrp = strtok(optarg, ",");
1177                         sizep = strtok(NULL, ",");
1178                         addrp = strtok(optarg, ",+");
1179                         offp = strtok(NULL, ",");
1180
1181                         size = sizep ? strtoul(sizep, NULL, 0) : 1;
1182                         off = offp ? strtoul(offp, NULL, 0) : 0;
1183
1184                         if (addrp[0] == '0') {
1185                                 addr = strtoul(addrp, NULL, 16) + off;
1186                         } else {
1187                                 addr = sym_to_addr(addrp) + off;
1188                                 if (sizep == NULL)
1189                                         size = sym_size(addrp) - off;
1190                                 if (addr - off == 0)
1191                                         fail(0, "symbol '%s' not found", addrp);
1192                                 if (size == 0)
1193                                         size = 1;
1194                         }
1195
1196
1197                         if (c == 'l') {
1198                                 filter_add(&filt, "arg0", addr, size);
1199                         } else {
1200                                 filter_add(&filt, "caller", addr, size);
1201                                 filter_add(&ifilt, "arg0", addr, size);
1202                         }
1203                         break;
1204
1205                 case 'e':
1206                         evp = strtok_r(optarg, ",", &lastp);
1207                         while (evp) {
1208                                 int ev1, ev2;
1209                                 char *evp2;
1210
1211                                 (void) strtok(evp, "-");
1212                                 evp2 = strtok(NULL, "-");
1213                                 ev1 = atoi(evp);
1214                                 ev2 = evp2 ? atoi(evp2) : ev1;
1215                                 if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1216                                     (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1217                                         fail(0, "-e events out of range");
1218                                 for (i = ev1; i <= ev2; i++)
1219                                         g_enabled[i] = 1;
1220                                 evp = strtok_r(NULL, ",", &lastp);
1221                         }
1222                         events_specified = 1;
1223                         break;
1224
1225                 case 'c':
1226                         g_cflag = 1;
1227                         break;
1228
1229                 case 'k':
1230                         g_kflag = 1;
1231                         break;
1232
1233                 case 'w':
1234                         g_wflag = 1;
1235                         break;
1236
1237                 case 'W':
1238                         g_Wflag = 1;
1239                         break;
1240
1241                 case 'g':
1242                         g_gflag = 1;
1243                         break;
1244
1245                 case 'C':
1246                 case 'E':
1247                 case 'H':
1248                 case 'I':
1249                         for (i = 0; i < LS_MAX_EVENTS; i++)
1250                                 if (g_event_info[i].ev_type == c)
1251                                         g_enabled[i] = 1;
1252                         events_specified = 1;
1253                         break;
1254
1255                 case 'A':
1256                         for (i = 0; i < LS_MAX_EVENTS; i++)
1257                                 if (strchr("CH", g_event_info[i].ev_type))
1258                                         g_enabled[i] = 1;
1259                         events_specified = 1;
1260                         break;
1261
1262                 case 'T':
1263                         g_tracing = 1;
1264                         break;
1265
1266                 case 'D':
1267                         if (!isdigit(optarg[0]))
1268                                 usage();
1269                         g_topn = atoi(optarg);
1270                         break;
1271
1272                 case 'R':
1273                         g_rates = 1;
1274                         break;
1275
1276                 case 'p':
1277                         g_pflag = 1;
1278                         break;
1279
1280                 case 'P':
1281                         g_Pflag = 1;
1282                         break;
1283
1284                 case 'o':
1285                         if ((out = fopen(optarg, "w")) == NULL)
1286                                 fail(1, "error opening file");
1287                         break;
1288
1289                 case 'V':
1290                         g_Vflag = 1;
1291                         break;
1292
1293                 default:
1294                         if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1295                                 usage();
1296                 }
1297         }
1298
1299         if (filt != NULL) {
1300                 predicate_add(&g_predicate, filt, NULL, 0);
1301                 filter_destroy(&filt);
1302         }
1303
1304         if (ifilt != NULL) {
1305                 predicate_add(&g_ipredicate, ifilt, NULL, 0);
1306                 filter_destroy(&ifilt);
1307         }
1308
1309         if (g_recsize == 0) {
1310                 if (g_gflag) {
1311                         g_stkdepth = LS_MAX_STACK_DEPTH;
1312                         g_recsize = LS_STACK(g_stkdepth);
1313                 } else {
1314                         g_recsize = LS_TIME;
1315                 }
1316         }
1317
1318         if (g_gflag && g_recsize <= LS_STACK(0))
1319                 fail(0, "'-g' requires at least '-s 1' data gathering");
1320
1321         /*
1322          * Make sure the alignment is reasonable
1323          */
1324         g_recsize = -(-g_recsize & -sizeof (uint64_t));
1325
1326         for (i = 0; i < LS_MAX_EVENTS; i++) {
1327                 /*
1328                  * If no events were specified, enable -C.
1329                  */
1330                 if (!events_specified && g_event_info[i].ev_type == 'C')
1331                         g_enabled[i] = 1;
1332         }
1333
1334         for (i = 0; i < LS_MAX_EVENTS; i++) {
1335                 if (!g_enabled[i])
1336                         continue;
1337
1338                 if (g_event_info[i].ev_acquire != NULL) {
1339                         /*
1340                          * If we've enabled a hold event, we must explicitly
1341                          * allocate dynamic variable space.
1342                          */
1343                         dynvar = 1;
1344                 }
1345
1346                 dprog_addevent(i);
1347         }
1348
1349         /*
1350          * Make sure there are remaining arguments to specify a child command
1351          * to execute.
1352          */
1353         if (argc <= optind)
1354                 usage();
1355
1356         if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1357                 dfail("couldn't determine number of online CPUs");
1358
1359         /*
1360          * By default, we set our data buffer size to be the number of records
1361          * multiplied by the size of the record, doubled to account for some
1362          * DTrace slop and divided by the number of CPUs.  We silently clamp
1363          * the aggregation size at both a minimum and a maximum to prevent
1364          * absurdly low or high values.
1365          */
1366         if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1367                 aggsize = MIN_AGGSIZE;
1368
1369         if (aggsize > MAX_AGGSIZE)
1370                 aggsize = MAX_AGGSIZE;
1371
1372         (void) sprintf(aggstr, "%lld", (long long)aggsize);
1373
1374         if (!g_tracing) {
1375                 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1376                         dfail("failed to set 'bufsize'");
1377
1378                 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1379                         dfail("failed to set 'aggsize'");
1380
1381                 if (dynvar) {
1382                         /*
1383                          * If we're using dynamic variables, we set our
1384                          * dynamic variable size to be one megabyte per CPU,
1385                          * with a hard-limit of 32 megabytes.  This may still
1386                          * be too small in some cases, but it can be tuned
1387                          * manually via -x if need be.
1388                          */
1389                         (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1390
1391                         if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1392                                 dfail("failed to set 'dynvarsize'");
1393                 }
1394         } else {
1395                 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1396                         dfail("failed to set 'bufsize'");
1397         }
1398
1399         if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1400                 dfail("failed to set 'statusrate'");
1401
1402         optind = 1;
1403         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1404                 switch (c) {
1405                 case 'x':
1406                         if ((p = strchr(optarg, '=')) != NULL)
1407                                 *p++ = '\0';
1408
1409                         if (dtrace_setopt(g_dtp, optarg, p) != 0)
1410                                 dfail("failed to set -x %s", optarg);
1411                         break;
1412                 }
1413         }
1414
1415         argc -= optind;
1416         argv += optind;
1417
1418         dprog_compile();
1419         status_init();
1420
1421         g_elapsed = -gethrtime();
1422
1423         /*
1424          * Spawn the specified command and wait for it to complete.
1425          */
1426         child = fork();
1427         if (child == -1)
1428                 fail(1, "cannot fork");
1429         if (child == 0) {
1430                 (void) dtrace_close(g_dtp);
1431                 (void) execvp(argv[0], &argv[0]);
1432                 exec_errno = errno;
1433                 exit(127);
1434         }
1435
1436 #ifdef illumos
1437         while (waitpid(child, &status, WEXITED) != child)
1438 #else
1439         while (waitpid(child, &status, 0) != child)
1440 #endif
1441                 status_check();
1442
1443         g_elapsed += gethrtime();
1444
1445         if (WIFEXITED(status)) {
1446                 if (WEXITSTATUS(status) != 0) {
1447                         if (exec_errno != 0) {
1448                                 errno = exec_errno;
1449                                 fail(1, "could not execute %s", argv[0]);
1450                         }
1451                         (void) fprintf(stderr,
1452                             "lockstat: warning: %s exited with code %d\n",
1453                             argv[0], WEXITSTATUS(status));
1454                 }
1455         } else {
1456                 (void) fprintf(stderr,
1457                     "lockstat: warning: %s died on signal %d\n",
1458                     argv[0], WTERMSIG(status));
1459         }
1460
1461         if (dtrace_stop(g_dtp) == -1)
1462                 dfail("failed to stop dtrace");
1463
1464         /*
1465          * Before we read out the results, we need to allocate our buffer.
1466          * If we're tracing, then we'll just use the precalculated size.  If
1467          * we're not, then we'll take a snapshot of the aggregate, and walk
1468          * it to count the number of records.
1469          */
1470         if (!g_tracing) {
1471                 if (dtrace_aggregate_snap(g_dtp) != 0)
1472                         dfail("failed to snap aggregate");
1473
1474                 g_nrecs = 0;
1475
1476                 if (dtrace_aggregate_walk(g_dtp,
1477                     count_aggregate, &g_nrecs) != 0)
1478                         dfail("failed to walk aggregate");
1479         }
1480
1481 #ifdef illumos
1482         if ((data_buf = memalign(sizeof (uint64_t),
1483             (g_nrecs + 1) * g_recsize)) == NULL)
1484 #else
1485         if (posix_memalign((void **)&data_buf, sizeof (uint64_t),  
1486             (g_nrecs + 1) * g_recsize) )
1487 #endif
1488                 fail(1, "Memory allocation failed");
1489
1490         /*
1491          * Read out the DTrace data.
1492          */
1493         g_nrecs_used = process_data(out, data_buf);
1494
1495         if (g_nrecs_used > g_nrecs || g_dropped)
1496                 (void) fprintf(stderr, "lockstat: warning: "
1497                     "ran out of data records (use -n for more)\n");
1498
1499         /* LINTED - alignment */
1500         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1501             /* LINTED - alignment */
1502             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1503                 ev_count[lsp->ls_event] += lsp->ls_count;
1504                 ev_time[lsp->ls_event] += lsp->ls_time;
1505         }
1506
1507         /*
1508          * If -g was specified, convert stacks into individual records.
1509          */
1510         if (g_gflag) {
1511                 lsrec_t *newlsp, *oldlsp;
1512
1513 #ifdef illumos
1514                 newlsp = memalign(sizeof (uint64_t),
1515                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1516 #else
1517                 posix_memalign((void **)&newlsp, sizeof (uint64_t), 
1518                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1519 #endif
1520                 if (newlsp == NULL)
1521                         fail(1, "Cannot allocate space for -g processing");
1522                 lsp = newlsp;
1523                 /* LINTED - alignment */
1524                 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1525                     /* LINTED - alignment */
1526                     oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1527                         int fr;
1528                         int caller_in_stack = 0;
1529
1530                         if (oldlsp->ls_count == 0)
1531                                 continue;
1532
1533                         for (fr = 0; fr < g_stkdepth; fr++) {
1534                                 if (oldlsp->ls_stack[fr] == 0)
1535                                         break;
1536                                 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1537                                         caller_in_stack = 1;
1538                                 bcopy(oldlsp, lsp, LS_TIME);
1539                                 lsp->ls_caller = oldlsp->ls_stack[fr];
1540                                 /* LINTED - alignment */
1541                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1542                         }
1543                         if (!caller_in_stack) {
1544                                 bcopy(oldlsp, lsp, LS_TIME);
1545                                 /* LINTED - alignment */
1546                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1547                         }
1548                 }
1549                 g_nrecs = g_nrecs_used =
1550                     ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1551                 g_recsize = LS_TIME;
1552                 g_stkdepth = 0;
1553                 free(data_buf);
1554                 data_buf = (char *)newlsp;
1555         }
1556
1557         if ((sort_buf = calloc(2 * (g_nrecs + 1),
1558             sizeof (void *))) == NULL)
1559                 fail(1, "Sort buffer allocation failed");
1560         merge_buf = sort_buf + (g_nrecs + 1);
1561
1562         /*
1563          * Build the sort buffer, discarding zero-count records along the way.
1564          */
1565         /* LINTED - alignment */
1566         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1567             /* LINTED - alignment */
1568             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1569                 if (lsp->ls_count == 0)
1570                         lsp->ls_event = LS_MAX_EVENTS;
1571                 sort_buf[i] = lsp;
1572         }
1573
1574         if (g_nrecs_used == 0)
1575                 exit(0);
1576
1577         /*
1578          * Add a sentinel after the last record
1579          */
1580         sort_buf[i] = lsp;
1581         lsp->ls_event = LS_MAX_EVENTS;
1582
1583         if (g_tracing) {
1584                 report_trace(out, sort_buf);
1585                 return (0);
1586         }
1587
1588         /*
1589          * Application of -g may have resulted in multiple records
1590          * with the same signature; coalesce them.
1591          */
1592         if (g_gflag) {
1593                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1594                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1595         }
1596
1597         /*
1598          * Coalesce locks within the same symbol if -c option specified.
1599          * Coalesce PCs within the same function if -k option specified.
1600          */
1601         if (g_cflag || g_kflag) {
1602                 for (i = 0; i < g_nrecs_used; i++) {
1603                         int fr;
1604                         lsp = sort_buf[i];
1605                         if (g_cflag)
1606                                 coalesce_symbol(&lsp->ls_lock);
1607                         if (g_kflag) {
1608                                 for (fr = 0; fr < g_stkdepth; fr++)
1609                                         coalesce_symbol(&lsp->ls_stack[fr]);
1610                                 coalesce_symbol(&lsp->ls_caller);
1611                         }
1612                 }
1613                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1614                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1615         }
1616
1617         /*
1618          * Coalesce callers if -w option specified
1619          */
1620         if (g_wflag) {
1621                 mergesort(lock_and_count_cmp_anywhere,
1622                     sort_buf, merge_buf, g_nrecs_used);
1623                 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1624         }
1625
1626         /*
1627          * Coalesce locks if -W option specified
1628          */
1629         if (g_Wflag) {
1630                 mergesort(site_and_count_cmp_anylock,
1631                     sort_buf, merge_buf, g_nrecs_used);
1632                 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1633         }
1634
1635         /*
1636          * Sort data by contention count (ls_count) or total time (ls_time),
1637          * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1638          */
1639         if (g_recsize < LS_TIME)
1640                 g_Pflag = 0;
1641
1642         if (g_Pflag)
1643                 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1644         else
1645                 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1646
1647         /*
1648          * Display data by event type
1649          */
1650         first = &sort_buf[0];
1651         while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1652                 current = first;
1653                 while ((lsp = *current)->ls_event == event)
1654                         current++;
1655                 report_stats(out, first, current - first, ev_count[event],
1656                     ev_time[event]);
1657                 first = current;
1658         }
1659
1660         return (0);
1661 }
1662
1663 static char *
1664 format_symbol(char *buf, uintptr_t addr, int show_size)
1665 {
1666         uintptr_t symoff;
1667         char *symname;
1668         size_t symsize;
1669
1670         symname = addr_to_sym(addr, &symoff, &symsize);
1671
1672         if (show_size && symoff == 0)
1673                 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1674         else if (symoff == 0)
1675                 (void) sprintf(buf, "%s", symname);
1676         else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)  /* CPU+PIL */
1677 #ifdef illumos
1678                 (void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1679 #else
1680                 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1681 #endif
1682         else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1683                 (void) sprintf(buf, "%s+0x%llx", symname,
1684                     (unsigned long long)symoff);
1685         else
1686                 (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1687         return (buf);
1688 }
1689
1690 static void
1691 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1692         uint64_t total_time)
1693 {
1694         uint32_t event = sort_buf[0]->ls_event;
1695         lsrec_t *lsp;
1696         double ptotal = 0.0;
1697         double percent;
1698         int i, j, fr;
1699         int displayed;
1700         int first_bin, last_bin, max_bin_count, total_bin_count;
1701         int rectype;
1702         char buf[256];
1703         char lhdr[80], chdr[80];
1704
1705         rectype = g_recsize;
1706
1707         if (g_topn == 0) {
1708                 (void) fprintf(out, "%20llu %s\n",
1709                     g_rates == 0 ? total_count :
1710                     ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1711                     g_event_info[event].ev_desc);
1712                 return;
1713         }
1714
1715         (void) sprintf(lhdr, "%s%s",
1716             g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1717         (void) sprintf(chdr, "%s%s",
1718             g_wflag ? "Hottest " : "", "Caller");
1719
1720         if (!g_pflag)
1721                 (void) fprintf(out,
1722                     "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1723                     g_event_info[event].ev_desc, (double)total_count,
1724                     (double)g_elapsed / NANOSEC,
1725                     (double)total_count * NANOSEC / g_elapsed);
1726
1727         if (!g_pflag && rectype < LS_HIST) {
1728                 (void) sprintf(buf, "%s", g_event_info[event].ev_units);
1729                 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1730                     g_rates ? "ops/s" : "Count",
1731                     g_gflag ? "genr" : "indv",
1732                     "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1733                 (void) fprintf(out, "---------------------------------"
1734                     "----------------------------------------------\n");
1735         }
1736
1737         displayed = 0;
1738         for (i = 0; i < nrecs; i++) {
1739                 lsp = sort_buf[i];
1740
1741                 if (displayed++ >= g_topn)
1742                         break;
1743
1744                 if (g_pflag) {
1745                         int j;
1746
1747                         (void) fprintf(out, "%u %u",
1748                             lsp->ls_event, lsp->ls_count);
1749                         (void) fprintf(out, " %s",
1750                             format_symbol(buf, lsp->ls_lock, g_cflag));
1751                         (void) fprintf(out, " %s",
1752                             format_symbol(buf, lsp->ls_caller, 0));
1753                         (void) fprintf(out, " %f",
1754                             (double)lsp->ls_refcnt / lsp->ls_count);
1755                         if (rectype >= LS_TIME)
1756                                 (void) fprintf(out, " %llu",
1757                                     (unsigned long long)lsp->ls_time);
1758                         if (rectype >= LS_HIST) {
1759                                 for (j = 0; j < 64; j++)
1760                                         (void) fprintf(out, " %u",
1761                                             lsp->ls_hist[j]);
1762                         }
1763                         for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1764                                 if (rectype <= LS_STACK(j) ||
1765                                     lsp->ls_stack[j] == 0)
1766                                         break;
1767                                 (void) fprintf(out, " %s",
1768                                     format_symbol(buf, lsp->ls_stack[j], 0));
1769                         }
1770                         (void) fprintf(out, "\n");
1771                         continue;
1772                 }
1773
1774                 if (rectype >= LS_HIST) {
1775                         (void) fprintf(out, "---------------------------------"
1776                             "----------------------------------------------\n");
1777                         (void) sprintf(buf, "%s",
1778                             g_event_info[event].ev_units);
1779                         (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1780                             g_rates ? "ops/s" : "Count",
1781                             g_gflag ? "genr" : "indv",
1782                             "cuml", "rcnt", buf, lhdr, chdr);
1783                 }
1784
1785                 if (g_Pflag && total_time != 0)
1786                         percent = (lsp->ls_time * 100.00) / total_time;
1787                 else
1788                         percent = (lsp->ls_count * 100.00) / total_count;
1789
1790                 ptotal += percent;
1791
1792                 if (rectype >= LS_TIME)
1793                         (void) sprintf(buf, "%llu",
1794                             (unsigned long long)(lsp->ls_time / lsp->ls_count));
1795                 else
1796                         buf[0] = '\0';
1797
1798                 (void) fprintf(out, "%5llu ",
1799                     g_rates == 0 ? lsp->ls_count :
1800                     ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1801
1802                 (void) fprintf(out, "%3.0f%% ", percent);
1803
1804                 if (g_gflag)
1805                         (void) fprintf(out, "---- ");
1806                 else
1807                         (void) fprintf(out, "%3.0f%% ", ptotal);
1808
1809                 (void) fprintf(out, "%4.2f %8s ",
1810                     (double)lsp->ls_refcnt / lsp->ls_count, buf);
1811
1812                 (void) fprintf(out, "%-22s ",
1813                     format_symbol(buf, lsp->ls_lock, g_cflag));
1814
1815                 (void) fprintf(out, "%-24s\n",
1816                     format_symbol(buf, lsp->ls_caller, 0));
1817
1818                 if (rectype < LS_HIST)
1819                         continue;
1820
1821                 (void) fprintf(out, "\n");
1822                 (void) fprintf(out, "%10s %31s %-9s %-24s\n",
1823                     g_event_info[event].ev_units,
1824                     "------ Time Distribution ------",
1825                     g_rates ? "ops/s" : "count",
1826                     rectype > LS_STACK(0) ? "Stack" : "");
1827
1828                 first_bin = 0;
1829                 while (lsp->ls_hist[first_bin] == 0)
1830                         first_bin++;
1831
1832                 last_bin = 63;
1833                 while (lsp->ls_hist[last_bin] == 0)
1834                         last_bin--;
1835
1836                 max_bin_count = 0;
1837                 total_bin_count = 0;
1838                 for (j = first_bin; j <= last_bin; j++) {
1839                         total_bin_count += lsp->ls_hist[j];
1840                         if (lsp->ls_hist[j] > max_bin_count)
1841                                 max_bin_count = lsp->ls_hist[j];
1842                 }
1843
1844                 /*
1845                  * If we went a few frames below the caller, ignore them
1846                  */
1847                 for (fr = 3; fr > 0; fr--)
1848                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1849                                 break;
1850
1851                 for (j = first_bin; j <= last_bin; j++) {
1852                         uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1853                         (void) fprintf(out, "%10llu |%s%s %-9u ",
1854                             1ULL << j,
1855                             "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1856                             "                              " + depth,
1857                             g_rates == 0 ? lsp->ls_hist[j] :
1858                             (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1859                             g_elapsed));
1860                         if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1861                                 (void) fprintf(out, "\n");
1862                                 continue;
1863                         }
1864                         (void) fprintf(out, "%-24s\n",
1865                             format_symbol(buf, lsp->ls_stack[fr], 0));
1866                         fr++;
1867                 }
1868                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1869                         (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1870                             format_symbol(buf, lsp->ls_stack[fr], 0));
1871                         fr++;
1872                 }
1873         }
1874
1875         if (!g_pflag)
1876                 (void) fprintf(out, "---------------------------------"
1877                     "----------------------------------------------\n");
1878
1879         (void) fflush(out);
1880 }
1881
1882 static void
1883 report_trace(FILE *out, lsrec_t **sort_buf)
1884 {
1885         lsrec_t *lsp;
1886         int i, fr;
1887         int rectype;
1888         char buf[256], buf2[256];
1889
1890         rectype = g_recsize;
1891
1892         if (!g_pflag) {
1893                 (void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1894                     "Event", "Time", "Owner", "Lock", "Caller");
1895                 (void) fprintf(out, "---------------------------------"
1896                     "----------------------------------------------\n");
1897         }
1898
1899         for (i = 0; i < g_nrecs_used; i++) {
1900
1901                 lsp = sort_buf[i];
1902
1903                 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1904                         continue;
1905
1906                 (void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1907                     lsp->ls_event, (unsigned long long)lsp->ls_time,
1908                     (void *)lsp->ls_next,
1909                     format_symbol(buf, lsp->ls_lock, 0),
1910                     format_symbol(buf2, lsp->ls_caller, 0));
1911
1912                 if (rectype <= LS_STACK(0))
1913                         continue;
1914
1915                 /*
1916                  * If we went a few frames below the caller, ignore them
1917                  */
1918                 for (fr = 3; fr > 0; fr--)
1919                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1920                                 break;
1921
1922                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1923                         (void) fprintf(out, "%53s  %-24s\n", "",
1924                             format_symbol(buf, lsp->ls_stack[fr], 0));
1925                         fr++;
1926                 }
1927                 (void) fprintf(out, "\n");
1928         }
1929
1930         (void) fflush(out);
1931 }