]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - cddl/contrib/opensolaris/cmd/lockstat/lockstat.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / cddl / contrib / opensolaris / cmd / lockstat / lockstat.c
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25
26 #pragma ident   "%Z%%M% %I%     %E% SMI"
27
28 #include <stdio.h>
29 #include <stddef.h>
30 #include <stdlib.h>
31 #include <stdarg.h>
32 #include <string.h>
33 #include <strings.h>
34 #include <ctype.h>
35 #include <fcntl.h>
36 #include <unistd.h>
37 #include <errno.h>
38 #include <limits.h>
39 #include <sys/types.h>
40 #include <sys/modctl.h>
41 #include <sys/stat.h>
42 #include <sys/wait.h>
43 #include <dtrace.h>
44 #include <sys/lockstat.h>
45 #include <alloca.h>
46 #include <signal.h>
47 #include <assert.h>
48
49 #if defined(sun)
50 #define GETOPT_EOF      EOF
51 #else
52 /* FreeBSD */ 
53 #include <sys/time.h>
54 #include <sys/resource.h>
55
56 #define mergesort(a, b, c, d)   lsmergesort(a, b, c, d)
57 #define GETOPT_EOF              (-1)
58
59 typedef uintptr_t       pc_t;
60 #endif /* defined(sun) */
61
62 #define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"
63
64 #define LS_MAX_STACK_DEPTH      50
65 #define LS_MAX_EVENTS           64
66
67 typedef struct lsrec {
68         struct lsrec    *ls_next;       /* next in hash chain */
69         uintptr_t       ls_lock;        /* lock address */
70         uintptr_t       ls_caller;      /* caller address */
71         uint32_t        ls_count;       /* cumulative event count */
72         uint32_t        ls_event;       /* type of event */
73         uintptr_t       ls_refcnt;      /* cumulative reference count */
74         uint64_t        ls_time;        /* cumulative event duration */
75         uint32_t        ls_hist[64];    /* log2(duration) histogram */
76         uintptr_t       ls_stack[LS_MAX_STACK_DEPTH];
77 } lsrec_t;
78
79 typedef struct lsdata {
80         struct lsrec    *lsd_next;      /* next available */
81         int             lsd_count;      /* number of records */
82 } lsdata_t;
83
84 /*
85  * Definitions for the types of experiments which can be run.  They are
86  * listed in increasing order of memory cost and processing time cost.
87  * The numerical value of each type is the number of bytes needed per record.
88  */
89 #define LS_BASIC        offsetof(lsrec_t, ls_time)
90 #define LS_TIME         offsetof(lsrec_t, ls_hist[0])
91 #define LS_HIST         offsetof(lsrec_t, ls_stack[0])
92 #define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])
93
94 static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
95 static void report_trace(FILE *, lsrec_t **);
96
97 extern int symtab_init(void);
98 extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
99 extern uintptr_t sym_to_addr(char *name);
100 extern size_t sym_size(char *name);
101 extern char *strtok_r(char *, const char *, char **);
102
103 #define DEFAULT_NRECS   10000
104 #define DEFAULT_HZ      97
105 #define MAX_HZ          1000
106 #define MIN_AGGSIZE     (16 * 1024)
107 #define MAX_AGGSIZE     (32 * 1024 * 1024)
108
109 static int g_stkdepth;
110 static int g_topn = INT_MAX;
111 static hrtime_t g_elapsed;
112 static int g_rates = 0;
113 static int g_pflag = 0;
114 static int g_Pflag = 0;
115 static int g_wflag = 0;
116 static int g_Wflag = 0;
117 static int g_cflag = 0;
118 static int g_kflag = 0;
119 static int g_gflag = 0;
120 static int g_Vflag = 0;
121 static int g_tracing = 0;
122 static size_t g_recsize;
123 static size_t g_nrecs;
124 static int g_nrecs_used;
125 static uchar_t g_enabled[LS_MAX_EVENTS];
126 static hrtime_t g_min_duration[LS_MAX_EVENTS];
127 static dtrace_hdl_t *g_dtp;
128 static char *g_predicate;
129 static char *g_ipredicate;
130 static char *g_prog;
131 static int g_proglen;
132 static int g_dropped;
133
134 typedef struct ls_event_info {
135         char    ev_type;
136         char    ev_lhdr[20];
137         char    ev_desc[80];
138         char    ev_units[10];
139         char    ev_name[DTRACE_NAMELEN];
140         char    *ev_predicate;
141         char    *ev_acquire;
142 } ls_event_info_t;
143
144 static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
145         { 'C',  "Lock", "Adaptive mutex spin",                  "nsec",
146             "lockstat:::adaptive-spin" },
147         { 'C',  "Lock", "Adaptive mutex block",                 "nsec",
148             "lockstat:::adaptive-block" },
149         { 'C',  "Lock", "Spin lock spin",                       "nsec",
150             "lockstat:::spin-spin" },
151         { 'C',  "Lock", "Thread lock spin",                     "nsec",
152             "lockstat:::thread-spin" },
153         { 'C',  "Lock", "R/W writer blocked by writer",         "nsec",
154             "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
155         { 'C',  "Lock", "R/W writer blocked by readers",        "nsec",
156             "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
157         { 'C',  "Lock", "R/W reader blocked by writer",         "nsec",
158             "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
159         { 'C',  "Lock", "R/W reader blocked by write wanted",   "nsec",
160             "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
161         { 'C',  "Lock", "Unknown event (type 8)",               "units" },
162         { 'C',  "Lock", "Unknown event (type 9)",               "units" },
163         { 'C',  "Lock", "Unknown event (type 10)",              "units" },
164         { 'C',  "Lock", "Unknown event (type 11)",              "units" },
165         { 'C',  "Lock", "Unknown event (type 12)",              "units" },
166         { 'C',  "Lock", "Unknown event (type 13)",              "units" },
167         { 'C',  "Lock", "Unknown event (type 14)",              "units" },
168         { 'C',  "Lock", "Unknown event (type 15)",              "units" },
169         { 'C',  "Lock", "Unknown event (type 16)",              "units" },
170         { 'C',  "Lock", "Unknown event (type 17)",              "units" },
171         { 'C',  "Lock", "Unknown event (type 18)",              "units" },
172         { 'C',  "Lock", "Unknown event (type 19)",              "units" },
173         { 'C',  "Lock", "Unknown event (type 20)",              "units" },
174         { 'C',  "Lock", "Unknown event (type 21)",              "units" },
175         { 'C',  "Lock", "Unknown event (type 22)",              "units" },
176         { 'C',  "Lock", "Unknown event (type 23)",              "units" },
177         { 'C',  "Lock", "Unknown event (type 24)",              "units" },
178         { 'C',  "Lock", "Unknown event (type 25)",              "units" },
179         { 'C',  "Lock", "Unknown event (type 26)",              "units" },
180         { 'C',  "Lock", "Unknown event (type 27)",              "units" },
181         { 'C',  "Lock", "Unknown event (type 28)",              "units" },
182         { 'C',  "Lock", "Unknown event (type 29)",              "units" },
183         { 'C',  "Lock", "Unknown event (type 30)",              "units" },
184         { 'C',  "Lock", "Unknown event (type 31)",              "units" },
185         { 'H',  "Lock", "Adaptive mutex hold",                  "nsec",
186             "lockstat:::adaptive-release", NULL,
187             "lockstat:::adaptive-acquire" },
188         { 'H',  "Lock", "Spin lock hold",                       "nsec",
189             "lockstat:::spin-release", NULL,
190             "lockstat:::spin-acquire" },
191         { 'H',  "Lock", "R/W writer hold",                      "nsec",
192             "lockstat:::rw-release", "arg1 == 0",
193             "lockstat:::rw-acquire" },
194         { 'H',  "Lock", "R/W reader hold",                      "nsec",
195             "lockstat:::rw-release", "arg1 != 0",
196             "lockstat:::rw-acquire" },
197         { 'H',  "Lock", "Unknown event (type 36)",              "units" },
198         { 'H',  "Lock", "Unknown event (type 37)",              "units" },
199         { 'H',  "Lock", "Unknown event (type 38)",              "units" },
200         { 'H',  "Lock", "Unknown event (type 39)",              "units" },
201         { 'H',  "Lock", "Unknown event (type 40)",              "units" },
202         { 'H',  "Lock", "Unknown event (type 41)",              "units" },
203         { 'H',  "Lock", "Unknown event (type 42)",              "units" },
204         { 'H',  "Lock", "Unknown event (type 43)",              "units" },
205         { 'H',  "Lock", "Unknown event (type 44)",              "units" },
206         { 'H',  "Lock", "Unknown event (type 45)",              "units" },
207         { 'H',  "Lock", "Unknown event (type 46)",              "units" },
208         { 'H',  "Lock", "Unknown event (type 47)",              "units" },
209         { 'H',  "Lock", "Unknown event (type 48)",              "units" },
210         { 'H',  "Lock", "Unknown event (type 49)",              "units" },
211         { 'H',  "Lock", "Unknown event (type 50)",              "units" },
212         { 'H',  "Lock", "Unknown event (type 51)",              "units" },
213         { 'H',  "Lock", "Unknown event (type 52)",              "units" },
214         { 'H',  "Lock", "Unknown event (type 53)",              "units" },
215         { 'H',  "Lock", "Unknown event (type 54)",              "units" },
216         { 'H',  "Lock", "Unknown event (type 55)",              "units" },
217 #if defined(sun)
218         { 'I',  "CPU+PIL", "Profiling interrupt",               "nsec",
219 #else
220         /* FreeBSD */
221         { 'I',  "CPU+Pri_Class", "Profiling interrupt",         "nsec",
222 #endif
223             "profile:::profile-97", NULL },
224         { 'I',  "Lock", "Unknown event (type 57)",              "units" },
225         { 'I',  "Lock", "Unknown event (type 58)",              "units" },
226         { 'I',  "Lock", "Unknown event (type 59)",              "units" },
227         { 'E',  "Lock", "Recursive lock entry detected",        "(N/A)",
228             "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
229         { 'E',  "Lock", "Lockstat enter failure",               "(N/A)" },
230         { 'E',  "Lock", "Lockstat exit failure",                "nsec"  },
231         { 'E',  "Lock", "Lockstat record failure",              "(N/A)" },
232 };
233
234 #if !defined(sun)
235 static char *g_pri_class[] = {
236         "",
237         "Intr",
238         "RealT",
239         "TShar",
240         "Idle"
241 };
242 #endif
243
244 static void
245 fail(int do_perror, const char *message, ...)
246 {
247         va_list args;
248         int save_errno = errno;
249
250         va_start(args, message);
251         (void) fprintf(stderr, "lockstat: ");
252         (void) vfprintf(stderr, message, args);
253         va_end(args);
254         if (do_perror)
255                 (void) fprintf(stderr, ": %s", strerror(save_errno));
256         (void) fprintf(stderr, "\n");
257         exit(2);
258 }
259
260 static void
261 dfail(const char *message, ...)
262 {
263         va_list args;
264
265         va_start(args, message);
266         (void) fprintf(stderr, "lockstat: ");
267         (void) vfprintf(stderr, message, args);
268         va_end(args);
269         (void) fprintf(stderr, ": %s\n",
270             dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));
271
272         exit(2);
273 }
274
275 static void
276 show_events(char event_type, char *desc)
277 {
278         int i, first = -1, last;
279
280         for (i = 0; i < LS_MAX_EVENTS; i++) {
281                 ls_event_info_t *evp = &g_event_info[i];
282                 if (evp->ev_type != event_type ||
283                     strncmp(evp->ev_desc, "Unknown event", 13) == 0)
284                         continue;
285                 if (first == -1)
286                         first = i;
287                 last = i;
288         }
289
290         (void) fprintf(stderr,
291             "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
292             desc, event_type, first, last);
293
294         for (i = first; i <= last; i++)
295                 (void) fprintf(stderr,
296                     "%4d = %s\n", i, g_event_info[i].ev_desc);
297 }
298
299 static void
300 usage(void)
301 {
302         (void) fprintf(stderr,
303             "Usage: lockstat [options] command [args]\n"
304             "\nEvent selection options:\n\n"
305             "  -C              watch contention events [on by default]\n"
306             "  -E              watch error events [off by default]\n"
307             "  -H              watch hold events [off by default]\n"
308             "  -I              watch interrupt events [off by default]\n"
309             "  -A              watch all lock events [equivalent to -CH]\n"
310             "  -e event_list   only watch the specified events (shown below);\n"
311             "                  <event_list> is a comma-separated list of\n"
312             "                  events or ranges of events, e.g. 1,4-7,35\n"
313             "  -i rate         interrupt rate for -I [default: %d Hz]\n"
314             "\nData gathering options:\n\n"
315             "  -b              basic statistics (lock, caller, event count)\n"
316             "  -t              timing for all events [default]\n"
317             "  -h              histograms for event times\n"
318             "  -s depth        stack traces <depth> deep\n"
319             "  -x opt[=val]    enable or modify DTrace options\n"
320             "\nData filtering options:\n\n"
321             "  -n nrecords     maximum number of data records [default: %d]\n"
322             "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
323             "                  symbolic name or hex address; <size> defaults\n"
324             "                  to the ELF symbol size if available, 1 if not\n"
325             "  -f func[,size]  only watch events generated by <func>\n"
326             "  -d duration     only watch events longer than <duration>\n"
327             "  -T              trace (rather than sample) events\n"
328             "\nData reporting options:\n\n"
329             "  -c              coalesce lock data for arrays like pse_mutex[]\n"
330             "  -k              coalesce PCs within functions\n"
331             "  -g              show total events generated by function\n"
332             "  -w              wherever: don't distinguish events by caller\n"
333             "  -W              whichever: don't distinguish events by lock\n"
334             "  -R              display rates rather than counts\n"
335             "  -p              parsable output format (awk(1)-friendly)\n"
336             "  -P              sort lock data by (count * avg_time) product\n"
337             "  -D n            only display top <n> events of each type\n"
338             "  -o filename     send output to <filename>\n",
339             DEFAULT_HZ, DEFAULT_NRECS);
340
341         show_events('C', "Contention");
342         show_events('H', "Hold-time");
343         show_events('I', "Interrupt");
344         show_events('E', "Error");
345         (void) fprintf(stderr, "\n");
346
347         exit(1);
348 }
349
350 static int
351 lockcmp(lsrec_t *a, lsrec_t *b)
352 {
353         int i;
354
355         if (a->ls_event < b->ls_event)
356                 return (-1);
357         if (a->ls_event > b->ls_event)
358                 return (1);
359
360         for (i = g_stkdepth - 1; i >= 0; i--) {
361                 if (a->ls_stack[i] < b->ls_stack[i])
362                         return (-1);
363                 if (a->ls_stack[i] > b->ls_stack[i])
364                         return (1);
365         }
366
367         if (a->ls_caller < b->ls_caller)
368                 return (-1);
369         if (a->ls_caller > b->ls_caller)
370                 return (1);
371
372         if (a->ls_lock < b->ls_lock)
373                 return (-1);
374         if (a->ls_lock > b->ls_lock)
375                 return (1);
376
377         return (0);
378 }
379
380 static int
381 countcmp(lsrec_t *a, lsrec_t *b)
382 {
383         if (a->ls_event < b->ls_event)
384                 return (-1);
385         if (a->ls_event > b->ls_event)
386                 return (1);
387
388         return (b->ls_count - a->ls_count);
389 }
390
391 static int
392 timecmp(lsrec_t *a, lsrec_t *b)
393 {
394         if (a->ls_event < b->ls_event)
395                 return (-1);
396         if (a->ls_event > b->ls_event)
397                 return (1);
398
399         if (a->ls_time < b->ls_time)
400                 return (1);
401         if (a->ls_time > b->ls_time)
402                 return (-1);
403
404         return (0);
405 }
406
407 static int
408 lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
409 {
410         if (a->ls_event < b->ls_event)
411                 return (-1);
412         if (a->ls_event > b->ls_event)
413                 return (1);
414
415         if (a->ls_lock < b->ls_lock)
416                 return (-1);
417         if (a->ls_lock > b->ls_lock)
418                 return (1);
419
420         return (0);
421 }
422
423 static int
424 lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
425 {
426         if (a->ls_event < b->ls_event)
427                 return (-1);
428         if (a->ls_event > b->ls_event)
429                 return (1);
430
431         if (a->ls_lock < b->ls_lock)
432                 return (-1);
433         if (a->ls_lock > b->ls_lock)
434                 return (1);
435
436         return (b->ls_count - a->ls_count);
437 }
438
439 static int
440 sitecmp_anylock(lsrec_t *a, lsrec_t *b)
441 {
442         int i;
443
444         if (a->ls_event < b->ls_event)
445                 return (-1);
446         if (a->ls_event > b->ls_event)
447                 return (1);
448
449         for (i = g_stkdepth - 1; i >= 0; i--) {
450                 if (a->ls_stack[i] < b->ls_stack[i])
451                         return (-1);
452                 if (a->ls_stack[i] > b->ls_stack[i])
453                         return (1);
454         }
455
456         if (a->ls_caller < b->ls_caller)
457                 return (-1);
458         if (a->ls_caller > b->ls_caller)
459                 return (1);
460
461         return (0);
462 }
463
464 static int
465 site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
466 {
467         int i;
468
469         if (a->ls_event < b->ls_event)
470                 return (-1);
471         if (a->ls_event > b->ls_event)
472                 return (1);
473
474         for (i = g_stkdepth - 1; i >= 0; i--) {
475                 if (a->ls_stack[i] < b->ls_stack[i])
476                         return (-1);
477                 if (a->ls_stack[i] > b->ls_stack[i])
478                         return (1);
479         }
480
481         if (a->ls_caller < b->ls_caller)
482                 return (-1);
483         if (a->ls_caller > b->ls_caller)
484                 return (1);
485
486         return (b->ls_count - a->ls_count);
487 }
488
489 static void
490 lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
491 {
492         int m = n / 2;
493         int i, j;
494
495         if (m > 1)
496                 lsmergesort(cmp, a, b, m);
497         if (n - m > 1)
498                 lsmergesort(cmp, a + m, b + m, n - m);
499         for (i = m; i > 0; i--)
500                 b[i - 1] = a[i - 1];
501         for (j = m - 1; j < n - 1; j++)
502                 b[n + m - j - 2] = a[j + 1];
503         while (i < j)
504                 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
505         *a = b[i];
506 }
507
508 static void
509 coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
510 {
511         int i, j;
512         lsrec_t *target, *current;
513
514         target = lock[0];
515
516         for (i = 1; i < n; i++) {
517                 current = lock[i];
518                 if (cmp(current, target) != 0) {
519                         target = current;
520                         continue;
521                 }
522                 current->ls_event = LS_MAX_EVENTS;
523                 target->ls_count += current->ls_count;
524                 target->ls_refcnt += current->ls_refcnt;
525                 if (g_recsize < LS_TIME)
526                         continue;
527                 target->ls_time += current->ls_time;
528                 if (g_recsize < LS_HIST)
529                         continue;
530                 for (j = 0; j < 64; j++)
531                         target->ls_hist[j] += current->ls_hist[j];
532         }
533 }
534
535 static void
536 coalesce_symbol(uintptr_t *addrp)
537 {
538         uintptr_t symoff;
539         size_t symsize;
540
541         if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize)
542                 *addrp -= symoff;
543 }
544
545 static void
546 predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
547 {
548         char *new;
549         int len, newlen;
550
551         if (what == NULL)
552                 return;
553
554         if (*pred == NULL) {
555                 *pred = malloc(1);
556                 *pred[0] = '\0';
557         }
558
559         len = strlen(*pred);
560         newlen = len + strlen(what) + 32 + strlen("( && )");
561         new = malloc(newlen);
562
563         if (*pred[0] != '\0') {
564                 if (cmp != NULL) {
565                         (void) sprintf(new, "(%s) && (%s %s 0x%p)",
566                             *pred, what, cmp, (void *)value);
567                 } else {
568                         (void) sprintf(new, "(%s) && (%s)", *pred, what);
569                 }
570         } else {
571                 if (cmp != NULL) {
572                         (void) sprintf(new, "%s %s 0x%p",
573                             what, cmp, (void *)value);
574                 } else {
575                         (void) sprintf(new, "%s", what);
576                 }
577         }
578
579         free(*pred);
580         *pred = new;
581 }
582
583 static void
584 predicate_destroy(char **pred)
585 {
586         free(*pred);
587         *pred = NULL;
588 }
589
590 static void
591 filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
592 {
593         char buf[256], *c = buf, *new;
594         int len, newlen;
595
596         if (*filt == NULL) {
597                 *filt = malloc(1);
598                 *filt[0] = '\0';
599         }
600
601 #if defined(sun)
602         (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ?
603             " || " : "", what, (void *)base, what, (void *)(base + size));
604 #else
605         (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ?
606             " || " : "", what, (void *)base, what, (void *)(base + size));
607 #endif
608
609         newlen = (len = strlen(*filt) + 1) + strlen(c);
610         new = malloc(newlen);
611         bcopy(*filt, new, len);
612         (void) strcat(new, c);
613         free(*filt);
614         *filt = new;
615 }
616
617 static void
618 filter_destroy(char **filt)
619 {
620         free(*filt);
621         *filt = NULL;
622 }
623
624 static void
625 dprog_add(const char *fmt, ...)
626 {
627         va_list args;
628         int size, offs;
629         char c;
630
631         va_start(args, fmt);
632         size = vsnprintf(&c, 1, fmt, args) + 1;
633         va_end(args);
634
635         if (g_proglen == 0) {
636                 offs = 0;
637         } else {
638                 offs = g_proglen - 1;
639         }
640
641         g_proglen = offs + size;
642
643         if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
644                 fail(1, "failed to reallocate program text");
645
646         va_start(args, fmt);
647         (void) vsnprintf(&g_prog[offs], size, fmt, args);
648         va_end(args);
649 }
650
651 /*
652  * This function may read like an open sewer, but keep in mind that programs
653  * that generate other programs are rarely pretty.  If one has the unenviable
654  * task of maintaining or -- worse -- extending this code, use the -V option
655  * to examine the D program as generated by this function.
656  */
657 static void
658 dprog_addevent(int event)
659 {
660         ls_event_info_t *info = &g_event_info[event];
661         char *pred = NULL;
662         char stack[20];
663         const char *arg0, *caller;
664         char *arg1 = "arg1";
665         char buf[80];
666         hrtime_t dur;
667         int depth;
668
669         if (info->ev_name[0] == '\0')
670                 return;
671
672         if (info->ev_type == 'I') {
673                 /*
674                  * For interrupt events, arg0 (normally the lock pointer) is
675                  * the CPU address plus the current pil, and arg1 (normally
676                  * the number of nanoseconds) is the number of nanoseconds
677                  * late -- and it's stored in arg2.
678                  */
679 #if defined(sun)
680                 arg0 = "(uintptr_t)curthread->t_cpu + \n"
681                     "\t    curthread->t_cpu->cpu_profile_pil";
682 #else
683                 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n"
684                     "\t    0x01000000 + curthread->td_pri_class";
685 #endif
686                 caller = "(uintptr_t)arg0";
687                 arg1 = "arg2";
688         } else {
689                 arg0 = "(uintptr_t)arg0";
690                 caller = "caller";
691         }
692
693         if (g_recsize > LS_HIST) {
694                 for (depth = 0; g_recsize > LS_STACK(depth); depth++)
695                         continue;
696
697                 if (g_tracing) {
698                         (void) sprintf(stack, "\tstack(%d);\n", depth);
699                 } else {
700                         (void) sprintf(stack, ", stack(%d)", depth);
701                 }
702         } else {
703                 (void) sprintf(stack, "");
704         }
705
706         if (info->ev_acquire != NULL) {
707                 /*
708                  * If this is a hold event, we need to generate an additional
709                  * clause for the acquire; the clause for the release will be
710                  * generated with the aggregating statement, below.
711                  */
712                 dprog_add("%s\n", info->ev_acquire);
713                 predicate_add(&pred, info->ev_predicate, NULL, 0);
714                 predicate_add(&pred, g_predicate, NULL, 0);
715                 if (pred != NULL)
716                         dprog_add("/%s/\n", pred);
717
718                 dprog_add("{\n");
719                 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event);
720
721                 if (info->ev_type == 'H') {
722                         dprog_add("\t%s = timestamp;\n", buf);
723                 } else {
724                         /*
725                          * If this isn't a hold event, it's the recursive
726                          * error event.  For this, we simply bump the
727                          * thread-local, per-lock count.
728                          */
729                         dprog_add("\t%s++;\n", buf);
730                 }
731
732                 dprog_add("}\n\n");
733                 predicate_destroy(&pred);
734                 pred = NULL;
735
736                 if (info->ev_type == 'E') {
737                         /*
738                          * If this is the recursive lock error event, we need
739                          * to generate an additional clause to decrement the
740                          * thread-local, per-lock count.  This assures that we
741                          * only execute the aggregating clause if we have
742                          * recursive entry.
743                          */
744                         dprog_add("%s\n", info->ev_name);
745                         dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
746                 }
747
748                 predicate_add(&pred, buf, NULL, 0);
749
750                 if (info->ev_type == 'H') {
751                         (void) sprintf(buf, "timestamp -\n\t    "
752                             "self->ev%d[(uintptr_t)arg0]", event);
753                 }
754
755                 arg1 = buf;
756         } else {
757                 predicate_add(&pred, info->ev_predicate, NULL, 0);
758                 if (info->ev_type != 'I')
759                         predicate_add(&pred, g_predicate, NULL, 0);
760                 else
761                         predicate_add(&pred, g_ipredicate, NULL, 0);
762         }
763
764         if ((dur = g_min_duration[event]) != 0)
765                 predicate_add(&pred, arg1, ">=", dur);
766
767         dprog_add("%s\n", info->ev_name);
768
769         if (pred != NULL)
770                 dprog_add("/%s/\n", pred);
771         predicate_destroy(&pred);
772
773         dprog_add("{\n");
774
775         if (g_tracing) {
776                 dprog_add("\ttrace(%dULL);\n", event);
777                 dprog_add("\ttrace(%s);\n", arg0);
778                 dprog_add("\ttrace(%s);\n", caller);
779                 dprog_add(stack);
780         } else {
781                 /*
782                  * The ordering here is important:  when we process the
783                  * aggregate, we count on the fact that @avg appears before
784                  * @hist in program order to assure that @avg is assigned the
785                  * first aggregation variable ID and @hist assigned the
786                  * second; see the comment in process_aggregate() for details.
787                  */
788                 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
789                     event, arg0, caller, stack, arg1);
790
791                 if (g_recsize >= LS_HIST) {
792                         dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
793                             "(%s);\n", event, arg0, caller, stack, arg1);
794                 }
795         }
796
797         if (info->ev_acquire != NULL)
798                 dprog_add("\tself->ev%d[arg0] = 0;\n", event);
799
800         dprog_add("}\n\n");
801 }
802
803 static void
804 dprog_compile()
805 {
806         dtrace_prog_t *prog;
807         dtrace_proginfo_t info;
808
809         if (g_Vflag) {
810                 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
811                 (void) fputs(g_prog, stderr);
812                 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
813         }
814
815         if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
816             DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
817                 dfail("failed to compile program");
818
819         if (dtrace_program_exec(g_dtp, prog, &info) == -1)
820                 dfail("failed to enable probes");
821
822         if (dtrace_go(g_dtp) != 0)
823                 dfail("couldn't start tracing");
824 }
825
826 static void
827 #if defined(sun)
828 status_fire(void)
829 #else
830 status_fire(int i)
831 #endif
832 {}
833
834 static void
835 status_init(void)
836 {
837         dtrace_optval_t val, status, agg;
838         struct sigaction act;
839         struct itimerspec ts;
840         struct sigevent ev;
841         timer_t tid;
842
843         if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
844                 dfail("failed to get 'statusrate'");
845
846         if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
847                 dfail("failed to get 'statusrate'");
848
849         /*
850          * We would want to awaken at a rate that is the GCD of the statusrate
851          * and the aggrate -- but that seems a bit absurd.  Instead, we'll
852          * simply awaken at a rate that is the more frequent of the two, which
853          * assures that we're never later than the interval implied by the
854          * more frequent rate.
855          */
856         val = status < agg ? status : agg;
857
858         (void) sigemptyset(&act.sa_mask);
859         act.sa_flags = 0;
860         act.sa_handler = status_fire;
861         (void) sigaction(SIGUSR1, &act, NULL);
862
863         ev.sigev_notify = SIGEV_SIGNAL;
864         ev.sigev_signo = SIGUSR1;
865
866         if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
867                 dfail("cannot create CLOCK_REALTIME timer");
868
869         ts.it_value.tv_sec = val / NANOSEC;
870         ts.it_value.tv_nsec = val % NANOSEC;
871         ts.it_interval = ts.it_value;
872
873         if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
874                 dfail("cannot set time on CLOCK_REALTIME timer");
875 }
876
877 static void
878 status_check(void)
879 {
880         if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0)
881                 dfail("failed to snap aggregate");
882
883         if (dtrace_status(g_dtp) == -1)
884                 dfail("dtrace_status()");
885 }
886
887 static void
888 lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
889 {
890         bzero(lsrec, g_recsize);
891         lsrec->ls_count = 1;
892
893         if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3))
894                 fail(0, "truncated DTrace record");
895
896         if (rec->dtrd_size != sizeof (uint64_t))
897                 fail(0, "bad event size in first record");
898
899         /* LINTED - alignment */
900         lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset));
901         rec++;
902
903         if (rec->dtrd_size != sizeof (uintptr_t))
904                 fail(0, "bad lock address size in second record");
905
906         /* LINTED - alignment */
907         lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset));
908         rec++;
909
910         if (rec->dtrd_size != sizeof (uintptr_t))
911                 fail(0, "bad caller size in third record");
912
913         /* LINTED - alignment */
914         lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset));
915         rec++;
916
917         if (g_recsize > LS_HIST) {
918                 int frames, i;
919                 pc_t *stack;
920
921                 frames = rec->dtrd_size / sizeof (pc_t);
922                 /* LINTED - alignment */
923                 stack = (pc_t *)(data + rec->dtrd_offset);
924
925                 for (i = 1; i < frames; i++)
926                         lsrec->ls_stack[i - 1] = stack[i];
927         }
928 }
929
930 /*ARGSUSED*/
931 static int
932 count_aggregate(const dtrace_aggdata_t *agg, void *arg)
933 {
934         *((size_t *)arg) += 1;
935
936         return (DTRACE_AGGWALK_NEXT);
937 }
938
939 static int
940 process_aggregate(const dtrace_aggdata_t *agg, void *arg)
941 {
942         const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
943         caddr_t data = agg->dtada_data;
944         lsdata_t *lsdata = arg;
945         lsrec_t *lsrec = lsdata->lsd_next;
946         const dtrace_recdesc_t *rec;
947         uint64_t *avg, *quantized;
948         int i, j;
949
950         assert(lsdata->lsd_count < g_nrecs);
951
952         /*
953          * Aggregation variable IDs are guaranteed to be generated in program
954          * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
955          * plus one.  As "avg" appears before "hist" in program order, we know
956          * that "avg" will be allocated the first aggregation variable ID, and
957          * "hist" will be allocated the second aggregation variable ID -- and
958          * we therefore use the aggregation variable ID to differentiate the
959          * cases.
960          */
961         if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
962                 /*
963                  * If this is the histogram entry.  We'll copy the quantized
964                  * data into lc_hist, and jump over the rest.
965                  */
966                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
967
968                 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
969                         fail(0, "bad variable ID in aggregation record");
970
971                 if (rec->dtrd_size !=
972                     DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
973                         fail(0, "bad quantize size in aggregation record");
974
975                 /* LINTED - alignment */
976                 quantized = (uint64_t *)(data + rec->dtrd_offset);
977
978                 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
979                     i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
980                         lsrec->ls_hist[j] = quantized[i];
981
982                 goto out;
983         }
984
985         lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
986             aggdesc->dtagd_nrecs - 1, data);
987
988         rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
989
990         if (rec->dtrd_size != 2 * sizeof (uint64_t))
991                 fail(0, "bad avg size in aggregation record");
992
993         /* LINTED - alignment */
994         avg = (uint64_t *)(data + rec->dtrd_offset);
995         lsrec->ls_count = (uint32_t)avg[0];
996         lsrec->ls_time = (uintptr_t)avg[1];
997
998         if (g_recsize >= LS_HIST)
999                 return (DTRACE_AGGWALK_NEXT);
1000
1001 out:
1002         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1003         lsdata->lsd_count++;
1004
1005         return (DTRACE_AGGWALK_NEXT);
1006 }
1007
1008 static int
1009 process_trace(const dtrace_probedata_t *pdata, void *arg)
1010 {
1011         lsdata_t *lsdata = arg;
1012         lsrec_t *lsrec = lsdata->lsd_next;
1013         dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
1014         caddr_t data = pdata->dtpda_data;
1015
1016         if (lsdata->lsd_count >= g_nrecs)
1017                 return (DTRACE_CONSUME_NEXT);
1018
1019         lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);
1020
1021         lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
1022         lsdata->lsd_count++;
1023
1024         return (DTRACE_CONSUME_NEXT);
1025 }
1026
1027 static int
1028 process_data(FILE *out, char *data)
1029 {
1030         lsdata_t lsdata;
1031
1032         /* LINTED - alignment */
1033         lsdata.lsd_next = (lsrec_t *)data;
1034         lsdata.lsd_count = 0;
1035
1036         if (g_tracing) {
1037                 if (dtrace_consume(g_dtp, out,
1038                     process_trace, NULL, &lsdata) != 0)
1039                         dfail("failed to consume buffer");
1040
1041                 return (lsdata.lsd_count);
1042         }
1043
1044         if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
1045             process_aggregate, &lsdata) != 0)
1046                 dfail("failed to walk aggregate");
1047
1048         return (lsdata.lsd_count);
1049 }
1050
1051 /*ARGSUSED*/
1052 static int
1053 drophandler(const dtrace_dropdata_t *data, void *arg)
1054 {
1055         g_dropped++;
1056         (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
1057         return (DTRACE_HANDLE_OK);
1058 }
1059
1060 int
1061 main(int argc, char **argv)
1062 {
1063         char *data_buf;
1064         lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
1065         FILE *out = stdout;
1066         int c;
1067         pid_t child;
1068         int status;
1069         int i, j;
1070         hrtime_t duration;
1071         char *addrp, *offp, *sizep, *evp, *lastp, *p;
1072         uintptr_t addr;
1073         size_t size, off;
1074         int events_specified = 0;
1075         int exec_errno = 0;
1076         uint32_t event;
1077         char *filt = NULL, *ifilt = NULL;
1078         static uint64_t ev_count[LS_MAX_EVENTS + 1];
1079         static uint64_t ev_time[LS_MAX_EVENTS + 1];
1080         dtrace_optval_t aggsize;
1081         char aggstr[10];
1082         long ncpus;
1083         int dynvar = 0;
1084         int err;
1085
1086         if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
1087                 fail(0, "cannot open dtrace library: %s",
1088                     dtrace_errmsg(NULL, err));
1089         }
1090
1091         if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
1092                 dfail("couldn't establish drop handler");
1093
1094         if (symtab_init() == -1)
1095                 fail(1, "can't load kernel symbols");
1096
1097         g_nrecs = DEFAULT_NRECS;
1098
1099         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1100                 switch (c) {
1101                 case 'b':
1102                         g_recsize = LS_BASIC;
1103                         break;
1104
1105                 case 't':
1106                         g_recsize = LS_TIME;
1107                         break;
1108
1109                 case 'h':
1110                         g_recsize = LS_HIST;
1111                         break;
1112
1113                 case 's':
1114                         if (!isdigit(optarg[0]))
1115                                 usage();
1116                         g_stkdepth = atoi(optarg);
1117                         if (g_stkdepth > LS_MAX_STACK_DEPTH)
1118                                 fail(0, "max stack depth is %d",
1119                                     LS_MAX_STACK_DEPTH);
1120                         g_recsize = LS_STACK(g_stkdepth);
1121                         break;
1122
1123                 case 'n':
1124                         if (!isdigit(optarg[0]))
1125                                 usage();
1126                         g_nrecs = atoi(optarg);
1127                         break;
1128
1129                 case 'd':
1130                         if (!isdigit(optarg[0]))
1131                                 usage();
1132                         duration = atoll(optarg);
1133
1134                         /*
1135                          * XXX -- durations really should be per event
1136                          * since the units are different, but it's hard
1137                          * to express this nicely in the interface.
1138                          * Not clear yet what the cleanest solution is.
1139                          */
1140                         for (i = 0; i < LS_MAX_EVENTS; i++)
1141                                 if (g_event_info[i].ev_type != 'E')
1142                                         g_min_duration[i] = duration;
1143
1144                         break;
1145
1146                 case 'i':
1147                         if (!isdigit(optarg[0]))
1148                                 usage();
1149                         i = atoi(optarg);
1150                         if (i <= 0)
1151                                 usage();
1152                         if (i > MAX_HZ)
1153                                 fail(0, "max interrupt rate is %d Hz", MAX_HZ);
1154
1155                         for (j = 0; j < LS_MAX_EVENTS; j++)
1156                                 if (strcmp(g_event_info[j].ev_desc,
1157                                     "Profiling interrupt") == 0)
1158                                         break;
1159
1160                         (void) sprintf(g_event_info[j].ev_name,
1161                             "profile:::profile-%d", i);
1162                         break;
1163
1164                 case 'l':
1165                 case 'f':
1166                         addrp = strtok(optarg, ",");
1167                         sizep = strtok(NULL, ",");
1168                         addrp = strtok(optarg, ",+");
1169                         offp = strtok(NULL, ",");
1170
1171                         size = sizep ? strtoul(sizep, NULL, 0) : 1;
1172                         off = offp ? strtoul(offp, NULL, 0) : 0;
1173
1174                         if (addrp[0] == '0') {
1175                                 addr = strtoul(addrp, NULL, 16) + off;
1176                         } else {
1177                                 addr = sym_to_addr(addrp) + off;
1178                                 if (sizep == NULL)
1179                                         size = sym_size(addrp) - off;
1180                                 if (addr - off == 0)
1181                                         fail(0, "symbol '%s' not found", addrp);
1182                                 if (size == 0)
1183                                         size = 1;
1184                         }
1185
1186
1187                         if (c == 'l') {
1188                                 filter_add(&filt, "arg0", addr, size);
1189                         } else {
1190                                 filter_add(&filt, "caller", addr, size);
1191                                 filter_add(&ifilt, "arg0", addr, size);
1192                         }
1193                         break;
1194
1195                 case 'e':
1196                         evp = strtok_r(optarg, ",", &lastp);
1197                         while (evp) {
1198                                 int ev1, ev2;
1199                                 char *evp2;
1200
1201                                 (void) strtok(evp, "-");
1202                                 evp2 = strtok(NULL, "-");
1203                                 ev1 = atoi(evp);
1204                                 ev2 = evp2 ? atoi(evp2) : ev1;
1205                                 if ((uint_t)ev1 >= LS_MAX_EVENTS ||
1206                                     (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
1207                                         fail(0, "-e events out of range");
1208                                 for (i = ev1; i <= ev2; i++)
1209                                         g_enabled[i] = 1;
1210                                 evp = strtok_r(NULL, ",", &lastp);
1211                         }
1212                         events_specified = 1;
1213                         break;
1214
1215                 case 'c':
1216                         g_cflag = 1;
1217                         break;
1218
1219                 case 'k':
1220                         g_kflag = 1;
1221                         break;
1222
1223                 case 'w':
1224                         g_wflag = 1;
1225                         break;
1226
1227                 case 'W':
1228                         g_Wflag = 1;
1229                         break;
1230
1231                 case 'g':
1232                         g_gflag = 1;
1233                         break;
1234
1235                 case 'C':
1236                 case 'E':
1237                 case 'H':
1238                 case 'I':
1239                         for (i = 0; i < LS_MAX_EVENTS; i++)
1240                                 if (g_event_info[i].ev_type == c)
1241                                         g_enabled[i] = 1;
1242                         events_specified = 1;
1243                         break;
1244
1245                 case 'A':
1246                         for (i = 0; i < LS_MAX_EVENTS; i++)
1247                                 if (strchr("CH", g_event_info[i].ev_type))
1248                                         g_enabled[i] = 1;
1249                         events_specified = 1;
1250                         break;
1251
1252                 case 'T':
1253                         g_tracing = 1;
1254                         break;
1255
1256                 case 'D':
1257                         if (!isdigit(optarg[0]))
1258                                 usage();
1259                         g_topn = atoi(optarg);
1260                         break;
1261
1262                 case 'R':
1263                         g_rates = 1;
1264                         break;
1265
1266                 case 'p':
1267                         g_pflag = 1;
1268                         break;
1269
1270                 case 'P':
1271                         g_Pflag = 1;
1272                         break;
1273
1274                 case 'o':
1275                         if ((out = fopen(optarg, "w")) == NULL)
1276                                 fail(1, "error opening file");
1277                         break;
1278
1279                 case 'V':
1280                         g_Vflag = 1;
1281                         break;
1282
1283                 default:
1284                         if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
1285                                 usage();
1286                 }
1287         }
1288
1289         if (filt != NULL) {
1290                 predicate_add(&g_predicate, filt, NULL, 0);
1291                 filter_destroy(&filt);
1292         }
1293
1294         if (ifilt != NULL) {
1295                 predicate_add(&g_ipredicate, ifilt, NULL, 0);
1296                 filter_destroy(&ifilt);
1297         }
1298
1299         if (g_recsize == 0) {
1300                 if (g_gflag) {
1301                         g_stkdepth = LS_MAX_STACK_DEPTH;
1302                         g_recsize = LS_STACK(g_stkdepth);
1303                 } else {
1304                         g_recsize = LS_TIME;
1305                 }
1306         }
1307
1308         if (g_gflag && g_recsize <= LS_STACK(0))
1309                 fail(0, "'-g' requires at least '-s 1' data gathering");
1310
1311         /*
1312          * Make sure the alignment is reasonable
1313          */
1314         g_recsize = -(-g_recsize & -sizeof (uint64_t));
1315
1316         for (i = 0; i < LS_MAX_EVENTS; i++) {
1317                 /*
1318                  * If no events were specified, enable -C.
1319                  */
1320                 if (!events_specified && g_event_info[i].ev_type == 'C')
1321                         g_enabled[i] = 1;
1322         }
1323
1324         for (i = 0; i < LS_MAX_EVENTS; i++) {
1325                 if (!g_enabled[i])
1326                         continue;
1327
1328                 if (g_event_info[i].ev_acquire != NULL) {
1329                         /*
1330                          * If we've enabled a hold event, we must explicitly
1331                          * allocate dynamic variable space.
1332                          */
1333                         dynvar = 1;
1334                 }
1335
1336                 dprog_addevent(i);
1337         }
1338
1339         /*
1340          * Make sure there are remaining arguments to specify a child command
1341          * to execute.
1342          */
1343         if (argc <= optind)
1344                 usage();
1345
1346         if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
1347                 dfail("couldn't determine number of online CPUs");
1348
1349         /*
1350          * By default, we set our data buffer size to be the number of records
1351          * multiplied by the size of the record, doubled to account for some
1352          * DTrace slop and divided by the number of CPUs.  We silently clamp
1353          * the aggregation size at both a minimum and a maximum to prevent
1354          * absurdly low or high values.
1355          */
1356         if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
1357                 aggsize = MIN_AGGSIZE;
1358
1359         if (aggsize > MAX_AGGSIZE)
1360                 aggsize = MAX_AGGSIZE;
1361
1362         (void) sprintf(aggstr, "%lld", (long long)aggsize);
1363
1364         if (!g_tracing) {
1365                 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
1366                         dfail("failed to set 'bufsize'");
1367
1368                 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
1369                         dfail("failed to set 'aggsize'");
1370
1371                 if (dynvar) {
1372                         /*
1373                          * If we're using dynamic variables, we set our
1374                          * dynamic variable size to be one megabyte per CPU,
1375                          * with a hard-limit of 32 megabytes.  This may still
1376                          * be too small in some cases, but it can be tuned
1377                          * manually via -x if need be.
1378                          */
1379                         (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);
1380
1381                         if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
1382                                 dfail("failed to set 'dynvarsize'");
1383                 }
1384         } else {
1385                 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
1386                         dfail("failed to set 'bufsize'");
1387         }
1388
1389         if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
1390                 dfail("failed to set 'statusrate'");
1391
1392         optind = 1;
1393         while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) {
1394                 switch (c) {
1395                 case 'x':
1396                         if ((p = strchr(optarg, '=')) != NULL)
1397                                 *p++ = '\0';
1398
1399                         if (dtrace_setopt(g_dtp, optarg, p) != 0)
1400                                 dfail("failed to set -x %s", optarg);
1401                         break;
1402                 }
1403         }
1404
1405         argc -= optind;
1406         argv += optind;
1407
1408         dprog_compile();
1409         status_init();
1410
1411         g_elapsed = -gethrtime();
1412
1413         /*
1414          * Spawn the specified command and wait for it to complete.
1415          */
1416         child = fork();
1417         if (child == -1)
1418                 fail(1, "cannot fork");
1419         if (child == 0) {
1420                 (void) dtrace_close(g_dtp);
1421                 (void) execvp(argv[0], &argv[0]);
1422                 exec_errno = errno;
1423                 exit(127);
1424         }
1425
1426 #if defined(sun)
1427         while (waitpid(child, &status, WEXITED) != child)
1428 #else
1429         while (waitpid(child, &status, 0) != child)
1430 #endif
1431                 status_check();
1432
1433         g_elapsed += gethrtime();
1434
1435         if (WIFEXITED(status)) {
1436                 if (WEXITSTATUS(status) != 0) {
1437                         if (exec_errno != 0) {
1438                                 errno = exec_errno;
1439                                 fail(1, "could not execute %s", argv[0]);
1440                         }
1441                         (void) fprintf(stderr,
1442                             "lockstat: warning: %s exited with code %d\n",
1443                             argv[0], WEXITSTATUS(status));
1444                 }
1445         } else {
1446                 (void) fprintf(stderr,
1447                     "lockstat: warning: %s died on signal %d\n",
1448                     argv[0], WTERMSIG(status));
1449         }
1450
1451         if (dtrace_stop(g_dtp) == -1)
1452                 dfail("failed to stop dtrace");
1453
1454         /*
1455          * Before we read out the results, we need to allocate our buffer.
1456          * If we're tracing, then we'll just use the precalculated size.  If
1457          * we're not, then we'll take a snapshot of the aggregate, and walk
1458          * it to count the number of records.
1459          */
1460         if (!g_tracing) {
1461                 if (dtrace_aggregate_snap(g_dtp) != 0)
1462                         dfail("failed to snap aggregate");
1463
1464                 g_nrecs = 0;
1465
1466                 if (dtrace_aggregate_walk(g_dtp,
1467                     count_aggregate, &g_nrecs) != 0)
1468                         dfail("failed to walk aggregate");
1469         }
1470
1471 #if defined(sun)
1472         if ((data_buf = memalign(sizeof (uint64_t),
1473             (g_nrecs + 1) * g_recsize)) == NULL)
1474 #else
1475         if (posix_memalign((void **)&data_buf, sizeof (uint64_t),  
1476             (g_nrecs + 1) * g_recsize) )
1477 #endif
1478                 fail(1, "Memory allocation failed");
1479
1480         /*
1481          * Read out the DTrace data.
1482          */
1483         g_nrecs_used = process_data(out, data_buf);
1484
1485         if (g_nrecs_used > g_nrecs || g_dropped)
1486                 (void) fprintf(stderr, "lockstat: warning: "
1487                     "ran out of data records (use -n for more)\n");
1488
1489         /* LINTED - alignment */
1490         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1491             /* LINTED - alignment */
1492             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1493                 ev_count[lsp->ls_event] += lsp->ls_count;
1494                 ev_time[lsp->ls_event] += lsp->ls_time;
1495         }
1496
1497         /*
1498          * If -g was specified, convert stacks into individual records.
1499          */
1500         if (g_gflag) {
1501                 lsrec_t *newlsp, *oldlsp;
1502
1503 #if defined(sun)
1504                 newlsp = memalign(sizeof (uint64_t),
1505                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1506 #else
1507                 posix_memalign((void **)&newlsp, sizeof (uint64_t), 
1508                     g_nrecs_used * LS_TIME * (g_stkdepth + 1));
1509 #endif
1510                 if (newlsp == NULL)
1511                         fail(1, "Cannot allocate space for -g processing");
1512                 lsp = newlsp;
1513                 /* LINTED - alignment */
1514                 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1515                     /* LINTED - alignment */
1516                     oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
1517                         int fr;
1518                         int caller_in_stack = 0;
1519
1520                         if (oldlsp->ls_count == 0)
1521                                 continue;
1522
1523                         for (fr = 0; fr < g_stkdepth; fr++) {
1524                                 if (oldlsp->ls_stack[fr] == 0)
1525                                         break;
1526                                 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
1527                                         caller_in_stack = 1;
1528                                 bcopy(oldlsp, lsp, LS_TIME);
1529                                 lsp->ls_caller = oldlsp->ls_stack[fr];
1530                                 /* LINTED - alignment */
1531                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1532                         }
1533                         if (!caller_in_stack) {
1534                                 bcopy(oldlsp, lsp, LS_TIME);
1535                                 /* LINTED - alignment */
1536                                 lsp = (lsrec_t *)((char *)lsp + LS_TIME);
1537                         }
1538                 }
1539                 g_nrecs = g_nrecs_used =
1540                     ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
1541                 g_recsize = LS_TIME;
1542                 g_stkdepth = 0;
1543                 free(data_buf);
1544                 data_buf = (char *)newlsp;
1545         }
1546
1547         if ((sort_buf = calloc(2 * (g_nrecs + 1),
1548             sizeof (void *))) == NULL)
1549                 fail(1, "Sort buffer allocation failed");
1550         merge_buf = sort_buf + (g_nrecs + 1);
1551
1552         /*
1553          * Build the sort buffer, discarding zero-count records along the way.
1554          */
1555         /* LINTED - alignment */
1556         for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
1557             /* LINTED - alignment */
1558             lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
1559                 if (lsp->ls_count == 0)
1560                         lsp->ls_event = LS_MAX_EVENTS;
1561                 sort_buf[i] = lsp;
1562         }
1563
1564         if (g_nrecs_used == 0)
1565                 exit(0);
1566
1567         /*
1568          * Add a sentinel after the last record
1569          */
1570         sort_buf[i] = lsp;
1571         lsp->ls_event = LS_MAX_EVENTS;
1572
1573         if (g_tracing) {
1574                 report_trace(out, sort_buf);
1575                 return (0);
1576         }
1577
1578         /*
1579          * Application of -g may have resulted in multiple records
1580          * with the same signature; coalesce them.
1581          */
1582         if (g_gflag) {
1583                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1584                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1585         }
1586
1587         /*
1588          * Coalesce locks within the same symbol if -c option specified.
1589          * Coalesce PCs within the same function if -k option specified.
1590          */
1591         if (g_cflag || g_kflag) {
1592                 for (i = 0; i < g_nrecs_used; i++) {
1593                         int fr;
1594                         lsp = sort_buf[i];
1595                         if (g_cflag)
1596                                 coalesce_symbol(&lsp->ls_lock);
1597                         if (g_kflag) {
1598                                 for (fr = 0; fr < g_stkdepth; fr++)
1599                                         coalesce_symbol(&lsp->ls_stack[fr]);
1600                                 coalesce_symbol(&lsp->ls_caller);
1601                         }
1602                 }
1603                 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
1604                 coalesce(lockcmp, sort_buf, g_nrecs_used);
1605         }
1606
1607         /*
1608          * Coalesce callers if -w option specified
1609          */
1610         if (g_wflag) {
1611                 mergesort(lock_and_count_cmp_anywhere,
1612                     sort_buf, merge_buf, g_nrecs_used);
1613                 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
1614         }
1615
1616         /*
1617          * Coalesce locks if -W option specified
1618          */
1619         if (g_Wflag) {
1620                 mergesort(site_and_count_cmp_anylock,
1621                     sort_buf, merge_buf, g_nrecs_used);
1622                 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
1623         }
1624
1625         /*
1626          * Sort data by contention count (ls_count) or total time (ls_time),
1627          * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
1628          */
1629         if (g_recsize < LS_TIME)
1630                 g_Pflag = 0;
1631
1632         if (g_Pflag)
1633                 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
1634         else
1635                 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);
1636
1637         /*
1638          * Display data by event type
1639          */
1640         first = &sort_buf[0];
1641         while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
1642                 current = first;
1643                 while ((lsp = *current)->ls_event == event)
1644                         current++;
1645                 report_stats(out, first, current - first, ev_count[event],
1646                     ev_time[event]);
1647                 first = current;
1648         }
1649
1650         return (0);
1651 }
1652
1653 static char *
1654 format_symbol(char *buf, uintptr_t addr, int show_size)
1655 {
1656         uintptr_t symoff;
1657         char *symname;
1658         size_t symsize;
1659
1660         symname = addr_to_sym(addr, &symoff, &symsize);
1661
1662         if (show_size && symoff == 0)
1663                 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize);
1664         else if (symoff == 0)
1665                 (void) sprintf(buf, "%s", symname);
1666         else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0)  /* CPU+PIL */
1667 #if defined(sun)
1668                 (void) sprintf(buf, "%s+%ld", symname, (long)symoff);
1669 #else
1670                 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]);
1671 #endif
1672         else if (symoff <= symsize || (symoff < 256 && addr != symoff))
1673                 (void) sprintf(buf, "%s+0x%llx", symname,
1674                     (unsigned long long)symoff);
1675         else
1676                 (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
1677         return (buf);
1678 }
1679
1680 static void
1681 report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
1682         uint64_t total_time)
1683 {
1684         uint32_t event = sort_buf[0]->ls_event;
1685         lsrec_t *lsp;
1686         double ptotal = 0.0;
1687         double percent;
1688         int i, j, fr;
1689         int displayed;
1690         int first_bin, last_bin, max_bin_count, total_bin_count;
1691         int rectype;
1692         char buf[256];
1693         char lhdr[80], chdr[80];
1694
1695         rectype = g_recsize;
1696
1697         if (g_topn == 0) {
1698                 (void) fprintf(out, "%20llu %s\n",
1699                     g_rates == 0 ? total_count :
1700                     ((unsigned long long)total_count * NANOSEC) / g_elapsed,
1701                     g_event_info[event].ev_desc);
1702                 return;
1703         }
1704
1705         (void) sprintf(lhdr, "%s%s",
1706             g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
1707         (void) sprintf(chdr, "%s%s",
1708             g_wflag ? "Hottest " : "", "Caller");
1709
1710         if (!g_pflag)
1711                 (void) fprintf(out,
1712                     "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
1713                     g_event_info[event].ev_desc, (double)total_count,
1714                     (double)g_elapsed / NANOSEC,
1715                     (double)total_count * NANOSEC / g_elapsed);
1716
1717         if (!g_pflag && rectype < LS_HIST) {
1718                 (void) sprintf(buf, "%s", g_event_info[event].ev_units);
1719                 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1720                     g_rates ? "ops/s" : "Count",
1721                     g_gflag ? "genr" : "indv",
1722                     "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
1723                 (void) fprintf(out, "---------------------------------"
1724                     "----------------------------------------------\n");
1725         }
1726
1727         displayed = 0;
1728         for (i = 0; i < nrecs; i++) {
1729                 lsp = sort_buf[i];
1730
1731                 if (displayed++ >= g_topn)
1732                         break;
1733
1734                 if (g_pflag) {
1735                         int j;
1736
1737                         (void) fprintf(out, "%u %u",
1738                             lsp->ls_event, lsp->ls_count);
1739                         (void) fprintf(out, " %s",
1740                             format_symbol(buf, lsp->ls_lock, g_cflag));
1741                         (void) fprintf(out, " %s",
1742                             format_symbol(buf, lsp->ls_caller, 0));
1743                         (void) fprintf(out, " %f",
1744                             (double)lsp->ls_refcnt / lsp->ls_count);
1745                         if (rectype >= LS_TIME)
1746                                 (void) fprintf(out, " %llu",
1747                                     (unsigned long long)lsp->ls_time);
1748                         if (rectype >= LS_HIST) {
1749                                 for (j = 0; j < 64; j++)
1750                                         (void) fprintf(out, " %u",
1751                                             lsp->ls_hist[j]);
1752                         }
1753                         for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
1754                                 if (rectype <= LS_STACK(j) ||
1755                                     lsp->ls_stack[j] == 0)
1756                                         break;
1757                                 (void) fprintf(out, " %s",
1758                                     format_symbol(buf, lsp->ls_stack[j], 0));
1759                         }
1760                         (void) fprintf(out, "\n");
1761                         continue;
1762                 }
1763
1764                 if (rectype >= LS_HIST) {
1765                         (void) fprintf(out, "---------------------------------"
1766                             "----------------------------------------------\n");
1767                         (void) sprintf(buf, "%s",
1768                             g_event_info[event].ev_units);
1769                         (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
1770                             g_rates ? "ops/s" : "Count",
1771                             g_gflag ? "genr" : "indv",
1772                             "cuml", "rcnt", buf, lhdr, chdr);
1773                 }
1774
1775                 if (g_Pflag && total_time != 0)
1776                         percent = (lsp->ls_time * 100.00) / total_time;
1777                 else
1778                         percent = (lsp->ls_count * 100.00) / total_count;
1779
1780                 ptotal += percent;
1781
1782                 if (rectype >= LS_TIME)
1783                         (void) sprintf(buf, "%llu",
1784                             (unsigned long long)(lsp->ls_time / lsp->ls_count));
1785                 else
1786                         buf[0] = '\0';
1787
1788                 (void) fprintf(out, "%5llu ",
1789                     g_rates == 0 ? lsp->ls_count :
1790                     ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);
1791
1792                 (void) fprintf(out, "%3.0f%% ", percent);
1793
1794                 if (g_gflag)
1795                         (void) fprintf(out, "---- ");
1796                 else
1797                         (void) fprintf(out, "%3.0f%% ", ptotal);
1798
1799                 (void) fprintf(out, "%4.2f %8s ",
1800                     (double)lsp->ls_refcnt / lsp->ls_count, buf);
1801
1802                 (void) fprintf(out, "%-22s ",
1803                     format_symbol(buf, lsp->ls_lock, g_cflag));
1804
1805                 (void) fprintf(out, "%-24s\n",
1806                     format_symbol(buf, lsp->ls_caller, 0));
1807
1808                 if (rectype < LS_HIST)
1809                         continue;
1810
1811                 (void) fprintf(out, "\n");
1812                 (void) fprintf(out, "%10s %31s %-9s %-24s\n",
1813                     g_event_info[event].ev_units,
1814                     "------ Time Distribution ------",
1815                     g_rates ? "ops/s" : "count",
1816                     rectype > LS_STACK(0) ? "Stack" : "");
1817
1818                 first_bin = 0;
1819                 while (lsp->ls_hist[first_bin] == 0)
1820                         first_bin++;
1821
1822                 last_bin = 63;
1823                 while (lsp->ls_hist[last_bin] == 0)
1824                         last_bin--;
1825
1826                 max_bin_count = 0;
1827                 total_bin_count = 0;
1828                 for (j = first_bin; j <= last_bin; j++) {
1829                         total_bin_count += lsp->ls_hist[j];
1830                         if (lsp->ls_hist[j] > max_bin_count)
1831                                 max_bin_count = lsp->ls_hist[j];
1832                 }
1833
1834                 /*
1835                  * If we went a few frames below the caller, ignore them
1836                  */
1837                 for (fr = 3; fr > 0; fr--)
1838                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1839                                 break;
1840
1841                 for (j = first_bin; j <= last_bin; j++) {
1842                         uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count;
1843                         (void) fprintf(out, "%10llu |%s%s %-9u ",
1844                             1ULL << j,
1845                             "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
1846                             "                              " + depth,
1847                             g_rates == 0 ? lsp->ls_hist[j] :
1848                             (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
1849                             g_elapsed));
1850                         if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
1851                                 (void) fprintf(out, "\n");
1852                                 continue;
1853                         }
1854                         (void) fprintf(out, "%-24s\n",
1855                             format_symbol(buf, lsp->ls_stack[fr], 0));
1856                         fr++;
1857                 }
1858                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1859                         (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
1860                             format_symbol(buf, lsp->ls_stack[fr], 0));
1861                         fr++;
1862                 }
1863         }
1864
1865         if (!g_pflag)
1866                 (void) fprintf(out, "---------------------------------"
1867                     "----------------------------------------------\n");
1868
1869         (void) fflush(out);
1870 }
1871
1872 static void
1873 report_trace(FILE *out, lsrec_t **sort_buf)
1874 {
1875         lsrec_t *lsp;
1876         int i, fr;
1877         int rectype;
1878         char buf[256], buf2[256];
1879
1880         rectype = g_recsize;
1881
1882         if (!g_pflag) {
1883                 (void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
1884                     "Event", "Time", "Owner", "Lock", "Caller");
1885                 (void) fprintf(out, "---------------------------------"
1886                     "----------------------------------------------\n");
1887         }
1888
1889         for (i = 0; i < g_nrecs_used; i++) {
1890
1891                 lsp = sort_buf[i];
1892
1893                 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0)
1894                         continue;
1895
1896                 (void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
1897                     lsp->ls_event, (unsigned long long)lsp->ls_time,
1898                     (void *)lsp->ls_next,
1899                     format_symbol(buf, lsp->ls_lock, 0),
1900                     format_symbol(buf2, lsp->ls_caller, 0));
1901
1902                 if (rectype <= LS_STACK(0))
1903                         continue;
1904
1905                 /*
1906                  * If we went a few frames below the caller, ignore them
1907                  */
1908                 for (fr = 3; fr > 0; fr--)
1909                         if (lsp->ls_stack[fr] == lsp->ls_caller)
1910                                 break;
1911
1912                 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
1913                         (void) fprintf(out, "%53s  %-24s\n", "",
1914                             format_symbol(buf, lsp->ls_stack[fr], 0));
1915                         fr++;
1916                 }
1917                 (void) fprintf(out, "\n");
1918         }
1919
1920         (void) fflush(out);
1921 }