3 # dexplorer - DTrace system explorer, runs a collection of scripts.
4 # Written using DTrace (Solaris 10 3/05).
6 # This program automatically runs a collection of DTrace scripts to examine
7 # many areas of the system, and places the output in a meaningful directory
8 # structure that is tar'd and gzip'd.
10 # $Id: dexplorer 3 2007-08-01 10:50:08Z brendan $
12 # USAGE: dexplorer [-qyDT] [-d outputdir] [-i interval]
15 # -y # "yes", don't prompt for confirmation
16 # -D # don't delete output dir
17 # -T # don't create output tar.gz
18 # -d outputdir # output directory
19 # -i interval # interval for each sample
21 # dexplorer # default is 5 second samples
22 # dexplorer -y -i30 # no prompting, with 30 second samples
24 # SEE ALSO: DTraceToolkit
26 # THANKS: David Visser, et al. for the idea and encouragement.
28 # COPYRIGHT: Copyright (c) 2005 Brendan Gregg.
32 # The contents of this file are subject to the terms of the
33 # Common Development and Distribution License, Version 1.0 only
34 # (the "License"). You may not use this file except in compliance
37 # You can obtain a copy of the license at Docs/cddl1.txt
38 # or http://www.opensolaris.org/os/licensing.
39 # See the License for the specific language governing permissions
40 # and limitations under the License.
46 # This is currently a monolithic script, and while it contains only
47 # a few dozen straightforward DTrace scripts I think it's desirable to
48 # keep it that way. The scripts themselves have been designed to be very
49 # generic (eg, switching on all sdt:::), and are aggregations to keep a
50 # limit on the size of the output.
52 # Author: Brendan Gregg [Sydney, Australia]
54 # 23-Jun-2005 Brendan Gregg Created this.
55 # 28-Jun-2005 " " Last update.
60 interval=5 # seconds per sample; replaced by the -i option argument
61 verbose=1 # non-zero: print status/progress messages to the screen
62 prompt=1 # 1: ask the user to confirm the output dir before running
63 tar=1 # create the tar.gz file
64 delete=1 # delete output files afterwards (only acted on when tar is also set)
65 dtrace=/usr/sbin/dtrace # path to dtrace
66 root=. # parent dir for the output; "." means the current dir
67 PATH=/usr/bin:/usr/sbin # safe path
68 dir=de_`uname -n`_`date +%Y%m%d%H%M` # output dir name and tar.gz base name: de_<hostname>_<timestamp>
69 samples=20 # max number of tests; denominator of the percent-complete display
70 current=0 # current sample; incremented after each progress line is printed
75 while getopts d:hi:qyDT name
79 i) interval=$OPTARG ;;
85 USAGE: dexplorer [-qyDT] [-d outputdir] [-i interval]
88 -y # "yes", don't prompt for confirmation
89 -D # don't delete output dir
90 -T # don't create output tar.gz
91 -d outputdir # output directory
92 -i interval # interval for each sample
94 dexplorer # default is 5 second samples
95 dexplorer -y -i30 # no prompting, with 30 second samples
100 shift $(( OPTIND - 1 ))
105 if [[ "$prompt" == "1" ]] ; then
106 if [[ "$root" == "." ]]; then
107 print "Output dir will be the current dir ($PWD)."
109 print "Output dir will be $root"
111 print -n "Hit enter for yes, or type path: "
113 if [[ "$ans" == [yY] || "$ans" == [yY]es ]]; then
114 print "WARNING: I didn't ask for \"$ans\"!"
115 print "\tI was asking for the path or just enter."
116 print "\tignoring \"$ans\"..."
118 if [[ "$ans" != "" ]]; then
120 print "Output is now $root."
127 if [[ "$interval" == *[a-zA-Z]* ]]; then
128 print "ERROR2: Invalid interval $interval.\n"
129 print "Please use a number of seconds."
132 if (( ${#interval} < 1 )); then
133 print "ERROR3: Length of interval $interval too short.\n"
134 print "Minimum 1 second."
137 if [[ ! -d "$root" ]]; then
138 print "ERROR4: Output directory \"$root\" does not exist.\n"
139 print "Perhaps try a mkdir first?"
140 print "or use an existing dir, eg \"/tmp\""
143 if [[ ! -w "$root" ]]; then
144 print "ERROR5: Can't write to output directory \"$root\".\n"
145 print "Are you logged in as root?"
146 print "Perhaps try another directory, eg \"/tmp\""
149 if [[ `$dtrace -b1k -qn 'BEGIN { trace(pid); exit(0); }'` == "" ]]; then
150 print "ERROR6: Unable to run dtrace!\n"
151 print "Perhaps this is a permission problem? Try running as root."
155 # calculate total time
156 (( total = interval * samples ))
157 if (( total > 180 )); then
158 (( total = total / 60 ))
159 total="$total minutes"
161 total="$total seconds"
168 if (( verbose )); then print "$*"; fi
171 header='dtrace:::BEGIN {
172 printf("%Y, ", walltimestamp);
173 printf("%s %s %s %s %s, ", `utsname.sysname, `utsname.nodename,
174 `utsname.release, `utsname.version, `utsname.machine);
175 printf("%d secs\n",'$interval');
177 profile:::tick-'$interval'sec { exit(0); }
180 if (( verbose )); then
181 (( percent = current * 100 / samples ))
182 printf "%3d%% $*\n" $percent
183 (( current = current + 1 ))
187 ########################################
189 ########################################
203 if [[ "$base1" != "$base2" || "$err" != "0" ]]; then
204 print "ERROR7: tried to mkdir $dir from $root, but something failed.\n"
205 print "Check directories before rerunning."
218 decho "Starting dexplorer ver 0.76."
219 decho "Sample interval is $interval seconds. Total run is > $total."
220 ( print "dexplorer ver 0.76\n------------------"
227 # Capture Standard Info
229 args='pid,ppid,uid,gid,projid,zoneid,pset,pri,nice,' # ps -o column list, first half
230 args=$args'class,vsz,rss,time,pcpu,pmem,args' # second half appended to the first
231 uname -a > Info/uname-a # System identification
232 psrinfo -v > Info/psrinfo-v # CPU details
233 prtconf > Info/prtconf # Memory (+ devices)
234 df -k > Info/df-k # Disk usage, in kilobytes
235 ifconfig -a > Info/ifconfig-a # Network interfaces
236 ps -eo $args > Info/ps-o # Processes, all of them, with the columns listed in $args
237 uptime > Info/uptime # Load averages and uptime
243 dstatus "Interrupts by CPU..."
244 $dtrace -qn "$header"'
245 sdt:::interrupt-start { @num[cpu] = count(); }
248 printf("%-16s %16s\n", "CPU", "INTERRUPTS");
249 printa("%-16d %@16d\n", @num);
251 ' | $clean > Cpu/interrupt_by_cpu
253 dstatus "Interrupt times..."
254 $dtrace -qn "$header"'
255 sdt:::interrupt-start { self->ts = vtimestamp; }
256 sdt:::interrupt-complete
257 /self->ts && arg0 != 0/
259 this->devi = (struct dev_info *)arg0;
260 self->name = this->devi != 0 ?
261 stringof(`devnamesp[this->devi->devi_major].dn_name) : "?";
262 this->inst = this->devi != 0 ? this->devi->devi_instance : 0;
263 @num[self->name, this->inst] = sum(vtimestamp - self->ts);
266 sdt:::interrupt-complete { self->ts = 0; }
269 printf("%11s %16s\n", "DEVICE", "TIME (ns)");
270 printa("%10s%-3d %@16d\n", @num);
272 ' | $clean > Cpu/interrupt_time
274 dstatus "Dispatcher queue length by CPU..."
275 $dtrace -qn "$header"'
276 profile:::profile-1000
278 this->num = curthread->t_cpu->cpu_disp->disp_nrunnable;
279 @length[cpu] = lquantize(this->num, 0, 100, 1);
281 dtrace:::END { printa(" CPU %d%@d\n", @length); }
282 ' | $clean > Cpu/dispqlen_by_cpu
284 dstatus "Sdt counts..."
285 $dtrace -qn "$header"'
286 sdt:::{ @num[probefunc, probename] = count(); }
289 printf("%-32s %-32s %10s\n", "FUNC", "NAME", "COUNT");
290 printa("%-32s %-32s %@10d\n", @num);
292 ' | $clean > Cpu/sdt_count
298 dstatus "Pages paged in by process..."
299 $dtrace -qn "$header"'
300 vminfo:::pgpgin { @pg[pid, execname] = sum(arg0); }
303 printf("%6s %-16s %16s\n", "PID", "CMD", "PAGES");
304 printa("%6d %-16s %@16d\n", @pg);
306 ' | $clean > Disk/pgpgin_by_process
308 dstatus "Files opened successfully count..."
309 $dtrace -qn "$header"'
310 syscall::open*:entry { self->file = copyinstr(arg0); self->ok = 1; }
311 syscall::open*:return /self->ok && arg0 != -1/
313 @num[self->file] = count();
315 syscall::open*:return /self->ok/ { self->file = 0; self->ok = 0; }
318 printf("%-64s %8s\n", "FILE", "COUNT");
319 printa("%-64s %@8d\n", @num);
321 ' | $clean > Disk/fileopen_count
323 dstatus "Disk I/O size distribution by process..."
324 $dtrace -qn "$header"'
325 io:::start { @size[pid, execname] = quantize(args[0]->b_bcount); }
326 ' | $clean > Disk/sizedist_by_process
332 dstatus "Minor faults by process..."
333 $dtrace -qn "$header"'
334 vminfo:::as_fault { @mem[pid, execname] = sum(arg0); }
337 printf("%6s %-16s %16s\n", "PID", "CMD", "MINFAULTS");
338 printa("%6d %-16s %@16d\n", @mem);
340 ' | $clean > Mem/minf_by_process
343 dstatus "Vminfo data by process..."
344 $dtrace -qn "$header"'
345 vminfo::: { @data[pid, execname, probename] = sum(arg0); }
348 printf("%6s %-16s %-16s %16s\n",
349 "PID", "CMD", "STATISTIC", "VALUE");
350 printa("%6d %-16s %-16s %@16d\n", @data);
352 ' | $clean > Mem/vminfo_by_process
358 dstatus "Mib data by mib statistic..."
359 $dtrace -qn "$header"'
360 mib::: { @data[probename] = sum(arg0); }
363 printf("%-32s %16s\n", "STATISTIC", "VALUE");
364 printa("%-32s %@16d\n", @data);
366 ' | $clean > Net/mib_data
368 dstatus "TCP write bytes by process..."
369 $dtrace -qn "$header"'
370 fbt:ip:tcp_output:entry
372 this->size = msgdsize(args[1]);
373 @size[pid, execname] = sum(this->size);
377 printf("%6s %-16s %12s\n", "PID", "CMD", "BYTES");
378 printa("%6d %-16s %@12d\n", @size);
380 ' | $clean > Net/tcpw_by_process
386 dstatus "Sample process @ 1000 Hz..."
387 $dtrace -qn "$header"'
388 profile:::profile-1000
390 @num[pid, curpsinfo->pr_psargs] = count();
394 printf("%6s %12s %s\n", "PID", "SAMPLES", "ARGS");
395 printa("%6d %@12d %S\n", @num);
397 ' | $clean > Proc/sample_process
399 dstatus "Syscall count by process..."
400 $dtrace -qn "$header"'
401 syscall:::entry { @num[pid, execname, probefunc] = count(); }
404 printf("%6s %-24s %-24s %8s\n",
405 "PID", "CMD", "SYSCALL", "COUNT");
406 printa("%6d %-24s %-24s %@8d\n", @num);
408 ' | $clean > Proc/syscall_by_process
410 dstatus "Syscall count by syscall..."
411 $dtrace -qn "$header"'
412 syscall:::entry { @num[probefunc] = count(); }
415 printf("%-32s %16s\n", "SYSCALL", "COUNT");
416 printa("%-32s %@16d\n", @num);
418 ' | $clean > Proc/syscall_count
420 dstatus "Read bytes by process..."
421 $dtrace -qn "$header"'
422 sysinfo:::readch { @bytes[pid, execname] = sum(arg0); }
425 printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
426 printa("%6d %-16s %@16d\n", @bytes);
428 ' | $clean > Proc/readb_by_process
430 dstatus "Write bytes by process..."
431 $dtrace -qn "$header"'
432 sysinfo:::writech { @bytes[pid, execname] = sum(arg0); }
435 printf("%6s %-16s %16s\n", "PID", "CMD", "BYTES");
436 printa("%6d %-16s %@16d\n", @bytes);
438 ' | $clean > Proc/writeb_by_process
440 dstatus "Sysinfo counts by process..."
441 $dtrace -qn "$header"'
442 sysinfo::: { @num[pid, execname, probename] = sum(arg0); }
445 printf("%6s %-16s %-16s %16s\n",
446 "PID", "CMD", "STATISTIC", "COUNT");
447 printa("%6d %-16s %-16s %@16d\n", @num);
449 ' | $clean > Proc/sysinfo_by_process
451 dstatus "New process counts with arguments..."
452 $dtrace -qn "$header"'
455 @num[pid, ppid, curpsinfo->pr_psargs] = count();
459 printf("%6s %6s %8s %s\n", "PID", "PPID", "COUNT", "ARGS");
460 printa("%6d %6d %@8d %S\n", @num);
462 ' | $clean > Proc/newprocess_count
464 dstatus "Signal counts..."
465 $dtrace -qn "$header"'
467 @num[execname,args[2],stringof(args[1]->pr_fname)] = count();
471 printf("%-16s %-8s %-16s %8s\n",
472 "FROM", "SIG", "TO", "COUNT");
473 printa("%-16s %-8d %-16s %@8d\n", @num);
475 ' | $clean > Proc/signal_count
477 dstatus "Syscall error counts..."
478 $dtrace -qn "$header"'
479 syscall:::return /(int)arg0 == -1/
481 @num[pid, execname, probefunc, errno] = count();
485 printf("%6s %-16s %-32s %-6s %8s\n",
486 "PID", "CMD", "SYSCALL", "ERRNO", "COUNT");
487 printa("%6d %-16s %-32s %-6d %@8d\n", @num);
489 ' | $clean > Proc/syscall_errors
502 decho "File is $dir.tar.gz"
504 if (( delete && tar )); then
506 # this could all be a single "rm -r $dir", but since it will be run
507 # as root on production servers, let's be extra cautious and remove each file by name,
508 rm Cpu/interrupt_by_cpu
509 rm Cpu/interrupt_time
510 rm Cpu/dispqlen_by_cpu
512 rm Disk/pgpgin_by_process
513 rm Disk/fileopen_count
514 rm Disk/sizedist_by_process
515 rm Mem/minf_by_process
516 rm Mem/vminfo_by_process
518 rm Net/tcpw_by_process
519 rm Proc/sample_process
520 rm Proc/syscall_by_process
521 rm Proc/syscall_count
522 rm Proc/readb_by_process
523 rm Proc/writeb_by_process
524 rm Proc/sysinfo_by_process
525 rm Proc/newprocess_count
527 rm Proc/syscall_errors
545 decho "Directory is $dir"