]> CyberLeo.Net >> Repos - FreeBSD/releng/10.0.git/blob - lib/libdevstat/devstat.c
- Copy stable/10 (r259064) to releng/10.0 as part of the
[FreeBSD/releng/10.0.git] / lib / libdevstat / devstat.c
1 /*
2  * Copyright (c) 1997, 1998 Kenneth D. Merry.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. The name of the author may not be used to endorse or promote products
14  *    derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31
32 #include <sys/types.h>
33 #include <sys/sysctl.h>
34 #include <sys/errno.h>
35 #include <sys/resource.h>
36 #include <sys/queue.h>
37
38 #include <ctype.h>
39 #include <err.h>
40 #include <fcntl.h>
41 #include <limits.h>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <string.h>
45 #include <stdarg.h>
46 #include <kvm.h>
47 #include <nlist.h>
48
49 #include "devstat.h"
50
/*
 * Forward declaration of compute_stats().
 *
 * Going by the parameter list alone: it takes two devstat snapshots and an
 * elapsed time, and hands results back through the pointer arguments
 * (integer totals as u_int64_t, averages and rates as long double).
 * NOTE(review): the definition is not in this part of the file; confirm
 * units and NULL handling there before relying on this summary.
 */
int
compute_stats(struct devstat *current, struct devstat *previous,
              long double etime, u_int64_t *total_bytes,
              u_int64_t *total_transfers, u_int64_t *total_blocks,
              long double *kb_per_transfer, long double *transfers_per_second,
              long double *mb_per_second, long double *blocks_per_second,
              long double *ms_per_transaction);
58
/*
 * Kind of argument that accompanies a metric; devstat_arg_list[] pairs one
 * of these with every devstat_metric value.
 */
typedef enum {
        DEVSTAT_ARG_NOTYPE = 0, /* no argument type (paired with DSM_NONE) */
        DEVSTAT_ARG_UINT64 = 1, /* presumably a u_int64_t result -- confirm */
        DEVSTAT_ARG_LD     = 2, /* presumably a long double result -- confirm */
        DEVSTAT_ARG_SKIP   = 3  /* paired with DSM_SKIP */
} devstat_arg_type;
65
/*
 * Error message buffer.  The functions in this file snprintf() a
 * description of a failure into it before returning -1.
 */
char devstat_errbuf[DEVSTAT_ERRBUF_SIZE];
67
/*
 * Table to match descriptive strings with device types.  These are in
 * order from most common to least common to speed search time.
 *
 * Each row holds the descriptive string, the DEVSTAT_TYPE_* value it
 * stands for, and a DEVSTAT_MATCH_* value (TYPE, IF or PASS) naming the
 * category the string belongs to.  The all-zero row with a NULL string
 * terminates the table.
 */
struct devstat_match_table match_table[] = {
        {"da",          DEVSTAT_TYPE_DIRECT,    DEVSTAT_MATCH_TYPE},
        {"cd",          DEVSTAT_TYPE_CDROM,     DEVSTAT_MATCH_TYPE},
        {"scsi",        DEVSTAT_TYPE_IF_SCSI,   DEVSTAT_MATCH_IF},
        {"ide",         DEVSTAT_TYPE_IF_IDE,    DEVSTAT_MATCH_IF},
        {"other",       DEVSTAT_TYPE_IF_OTHER,  DEVSTAT_MATCH_IF},
        {"worm",        DEVSTAT_TYPE_WORM,      DEVSTAT_MATCH_TYPE},
        {"sa",          DEVSTAT_TYPE_SEQUENTIAL,DEVSTAT_MATCH_TYPE},
        {"pass",        DEVSTAT_TYPE_PASS,      DEVSTAT_MATCH_PASS},
        {"optical",     DEVSTAT_TYPE_OPTICAL,   DEVSTAT_MATCH_TYPE},
        {"array",       DEVSTAT_TYPE_STORARRAY, DEVSTAT_MATCH_TYPE},
        {"changer",     DEVSTAT_TYPE_CHANGER,   DEVSTAT_MATCH_TYPE},
        {"scanner",     DEVSTAT_TYPE_SCANNER,   DEVSTAT_MATCH_TYPE},
        {"printer",     DEVSTAT_TYPE_PRINTER,   DEVSTAT_MATCH_TYPE},
        {"floppy",      DEVSTAT_TYPE_FLOPPY,    DEVSTAT_MATCH_TYPE},
        {"proc",        DEVSTAT_TYPE_PROCESSOR, DEVSTAT_MATCH_TYPE},
        {"comm",        DEVSTAT_TYPE_COMM,      DEVSTAT_MATCH_TYPE},
        {"enclosure",   DEVSTAT_TYPE_ENCLOSURE, DEVSTAT_MATCH_TYPE},
        {NULL,          0,                      0}
};
92
93 struct devstat_args {
94         devstat_metric          metric;
95         devstat_arg_type        argtype;
96 } devstat_arg_list[] = {
97         { DSM_NONE, DEVSTAT_ARG_NOTYPE },
98         { DSM_TOTAL_BYTES, DEVSTAT_ARG_UINT64 },
99         { DSM_TOTAL_BYTES_READ, DEVSTAT_ARG_UINT64 },
100         { DSM_TOTAL_BYTES_WRITE, DEVSTAT_ARG_UINT64 },
101         { DSM_TOTAL_TRANSFERS, DEVSTAT_ARG_UINT64 },
102         { DSM_TOTAL_TRANSFERS_READ, DEVSTAT_ARG_UINT64 },
103         { DSM_TOTAL_TRANSFERS_WRITE, DEVSTAT_ARG_UINT64 },
104         { DSM_TOTAL_TRANSFERS_OTHER, DEVSTAT_ARG_UINT64 },
105         { DSM_TOTAL_BLOCKS, DEVSTAT_ARG_UINT64 },
106         { DSM_TOTAL_BLOCKS_READ, DEVSTAT_ARG_UINT64 },
107         { DSM_TOTAL_BLOCKS_WRITE, DEVSTAT_ARG_UINT64 },
108         { DSM_KB_PER_TRANSFER, DEVSTAT_ARG_LD },
109         { DSM_KB_PER_TRANSFER_READ, DEVSTAT_ARG_LD },
110         { DSM_KB_PER_TRANSFER_WRITE, DEVSTAT_ARG_LD },
111         { DSM_TRANSFERS_PER_SECOND, DEVSTAT_ARG_LD },
112         { DSM_TRANSFERS_PER_SECOND_READ, DEVSTAT_ARG_LD },
113         { DSM_TRANSFERS_PER_SECOND_WRITE, DEVSTAT_ARG_LD },
114         { DSM_TRANSFERS_PER_SECOND_OTHER, DEVSTAT_ARG_LD },
115         { DSM_MB_PER_SECOND, DEVSTAT_ARG_LD },
116         { DSM_MB_PER_SECOND_READ, DEVSTAT_ARG_LD },
117         { DSM_MB_PER_SECOND_WRITE, DEVSTAT_ARG_LD },
118         { DSM_BLOCKS_PER_SECOND, DEVSTAT_ARG_LD },
119         { DSM_BLOCKS_PER_SECOND_READ, DEVSTAT_ARG_LD },
120         { DSM_BLOCKS_PER_SECOND_WRITE, DEVSTAT_ARG_LD },
121         { DSM_MS_PER_TRANSACTION, DEVSTAT_ARG_LD },
122         { DSM_MS_PER_TRANSACTION_READ, DEVSTAT_ARG_LD },
123         { DSM_MS_PER_TRANSACTION_WRITE, DEVSTAT_ARG_LD },
124         { DSM_SKIP, DEVSTAT_ARG_SKIP },
125         { DSM_TOTAL_BYTES_FREE, DEVSTAT_ARG_UINT64 },
126         { DSM_TOTAL_TRANSFERS_FREE, DEVSTAT_ARG_UINT64 },
127         { DSM_TOTAL_BLOCKS_FREE, DEVSTAT_ARG_UINT64 },
128         { DSM_KB_PER_TRANSFER_FREE, DEVSTAT_ARG_LD },
129         { DSM_MB_PER_SECOND_FREE, DEVSTAT_ARG_LD },
130         { DSM_TRANSFERS_PER_SECOND_FREE, DEVSTAT_ARG_LD },
131         { DSM_BLOCKS_PER_SECOND_FREE, DEVSTAT_ARG_LD },
132         { DSM_MS_PER_TRANSACTION_OTHER, DEVSTAT_ARG_LD },
133         { DSM_MS_PER_TRANSACTION_FREE, DEVSTAT_ARG_LD },
134         { DSM_BUSY_PCT, DEVSTAT_ARG_LD },
135         { DSM_QUEUE_LENGTH, DEVSTAT_ARG_UINT64 },
136         { DSM_TOTAL_DURATION, DEVSTAT_ARG_LD },
137         { DSM_TOTAL_DURATION_READ, DEVSTAT_ARG_LD },
138         { DSM_TOTAL_DURATION_WRITE, DEVSTAT_ARG_LD },
139         { DSM_TOTAL_DURATION_FREE, DEVSTAT_ARG_LD },
140         { DSM_TOTAL_DURATION_OTHER, DEVSTAT_ARG_LD },
141         { DSM_TOTAL_BUSY_TIME, DEVSTAT_ARG_LD },
142 };
143
/*
 * Indices into namelist[].  X_END is one past the last entry.
 */
#define X_NUMDEVS       0
#define X_GENERATION    1
#define X_VERSION       2
#define X_DEVICE_STATQ  3
#define X_END           4

/*
 * Kernel symbol names handed to readkmem_nl() (see KREADNL) when the
 * statistics are read through a kvm descriptor instead of sysctl.
 */
static const char *namelist[] = {
        "_devstat_num_devs",            /* X_NUMDEVS */
        "_devstat_generation",          /* X_GENERATION */
        "_devstat_version",             /* X_VERSION */
        "_device_statq",                /* X_DEVICE_STATQ */
};
155
/*
 * Local function declarations.
 *
 * readkmem_nl() is reached through the KREADNL() macro, which passes it a
 * namelist[] symbol name plus the address and size of the destination
 * variable; callers in this file treat a return of -1 as failure.
 * get_devstat_kvm() supplies the device buffer for devstat_getdevs() when a
 * kvm descriptor is in use; a NULL return is treated as failure there.
 * NOTE(review): compare_select() and readkmem() are not used in this part
 * of the file; by signature compare_select() looks like a qsort(3)-style
 * comparator and readkmem() like an address-based kernel read -- confirm
 * against their definitions.
 */
static int compare_select(const void *arg1, const void *arg2);
static int readkmem(kvm_t *kd, unsigned long addr, void *buf, size_t nbytes);
static int readkmem_nl(kvm_t *kd, const char *name, void *buf, size_t nbytes);
static char *get_devstat_kvm(kvm_t *kd);
163
/*
 * Read the kernel variable named by namelist[var] into val by way of
 * readkmem_nl().
 *
 * kd  -- kvm descriptor to read through.
 * var -- one of the X_* indices into namelist[].
 * val -- destination; must be an lvalue, because both its address and its
 *        size are taken.
 *
 * Evaluates to readkmem_nl()'s return value; callers in this file treat -1
 * as failure.  Every argument is parenthesized so that an expression
 * argument cannot re-associate with the surrounding operators.
 */
#define KREADNL(kd, var, val) \
        readkmem_nl((kd), namelist[(var)], &(val), sizeof(val))
166
167 int
168 devstat_getnumdevs(kvm_t *kd)
169 {
170         size_t numdevsize;
171         int numdevs;
172
173         numdevsize = sizeof(int);
174
175         /*
176          * Find out how many devices we have in the system.
177          */
178         if (kd == NULL) {
179                 if (sysctlbyname("kern.devstat.numdevs", &numdevs,
180                                  &numdevsize, NULL, 0) == -1) {
181                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
182                                  "%s: error getting number of devices\n"
183                                  "%s: %s", __func__, __func__, 
184                                  strerror(errno));
185                         return(-1);
186                 } else
187                         return(numdevs);
188         } else {
189
190                 if (KREADNL(kd, X_NUMDEVS, numdevs) == -1)
191                         return(-1);
192                 else
193                         return(numdevs);
194         }
195 }
196
197 /*
198  * This is an easy way to get the generation number, but the generation is
199  * supplied in a more atmoic manner by the kern.devstat.all sysctl.
200  * Because this generation sysctl is separate from the statistics sysctl,
201  * the device list and the generation could change between the time that
202  * this function is called and the device list is retreived.
203  */
204 long
205 devstat_getgeneration(kvm_t *kd)
206 {
207         size_t gensize;
208         long generation;
209
210         gensize = sizeof(long);
211
212         /*
213          * Get the current generation number.
214          */
215         if (kd == NULL) {
216                 if (sysctlbyname("kern.devstat.generation", &generation, 
217                                  &gensize, NULL, 0) == -1) {
218                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
219                                  "%s: error getting devstat generation\n%s: %s",
220                                  __func__, __func__, strerror(errno));
221                         return(-1);
222                 } else
223                         return(generation);
224         } else {
225                 if (KREADNL(kd, X_GENERATION, generation) == -1)
226                         return(-1);
227                 else
228                         return(generation);
229         }
230 }
231
232 /*
233  * Get the current devstat version.  The return value of this function
234  * should be compared with DEVSTAT_VERSION, which is defined in
235  * sys/devicestat.h.  This will enable userland programs to determine
236  * whether they are out of sync with the kernel.
237  */
238 int
239 devstat_getversion(kvm_t *kd)
240 {
241         size_t versize;
242         int version;
243
244         versize = sizeof(int);
245
246         /*
247          * Get the current devstat version.
248          */
249         if (kd == NULL) {
250                 if (sysctlbyname("kern.devstat.version", &version, &versize,
251                                  NULL, 0) == -1) {
252                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
253                                  "%s: error getting devstat version\n%s: %s",
254                                  __func__, __func__, strerror(errno));
255                         return(-1);
256                 } else
257                         return(version);
258         } else {
259                 if (KREADNL(kd, X_VERSION, version) == -1)
260                         return(-1);
261                 else
262                         return(version);
263         }
264 }
265
266 /*
267  * Check the devstat version we know about against the devstat version the
268  * kernel knows about.  If they don't match, print an error into the
269  * devstat error buffer, and return -1.  If they match, return 0.
270  */
271 int
272 devstat_checkversion(kvm_t *kd)
273 {
274         int buflen, res, retval = 0, version;
275
276         version = devstat_getversion(kd);
277
278         if (version != DEVSTAT_VERSION) {
279                 /*
280                  * If getversion() returns an error (i.e. -1), then it
281                  * has printed an error message in the buffer.  Therefore,
282                  * we need to add a \n to the end of that message before we
283                  * print our own message in the buffer.
284                  */
285                 if (version == -1)
286                         buflen = strlen(devstat_errbuf);
287                 else
288                         buflen = 0;
289
290                 res = snprintf(devstat_errbuf + buflen,
291                                DEVSTAT_ERRBUF_SIZE - buflen,
292                                "%s%s: userland devstat version %d is not "
293                                "the same as the kernel\n%s: devstat "
294                                "version %d\n", version == -1 ? "\n" : "",
295                                __func__, DEVSTAT_VERSION, __func__, version);
296
297                 if (res < 0)
298                         devstat_errbuf[buflen] = '\0';
299
300                 buflen = strlen(devstat_errbuf);
301                 if (version < DEVSTAT_VERSION)
302                         res = snprintf(devstat_errbuf + buflen,
303                                        DEVSTAT_ERRBUF_SIZE - buflen,
304                                        "%s: libdevstat newer than kernel\n",
305                                        __func__);
306                 else
307                         res = snprintf(devstat_errbuf + buflen,
308                                        DEVSTAT_ERRBUF_SIZE - buflen,
309                                        "%s: kernel newer than libdevstat\n",
310                                        __func__);
311
312                 if (res < 0)
313                         devstat_errbuf[buflen] = '\0';
314
315                 retval = -1;
316         }
317
318         return(retval);
319 }
320
321 /*
322  * Get the current list of devices and statistics, and the current
323  * generation number.
324  * 
325  * Return values:
326  * -1  -- error
327  *  0  -- device list is unchanged
328  *  1  -- device list has changed
329  */
330 int
331 devstat_getdevs(kvm_t *kd, struct statinfo *stats)
332 {
333         int error;
334         size_t dssize;
335         int oldnumdevs;
336         long oldgeneration;
337         int retval = 0;
338         struct devinfo *dinfo;
339         struct timespec ts;
340
341         dinfo = stats->dinfo;
342
343         if (dinfo == NULL) {
344                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
345                          "%s: stats->dinfo was NULL", __func__);
346                 return(-1);
347         }
348
349         oldnumdevs = dinfo->numdevs;
350         oldgeneration = dinfo->generation;
351
352         clock_gettime(CLOCK_MONOTONIC, &ts);
353         stats->snap_time = ts.tv_sec + ts.tv_nsec * 1e-9;
354
355         if (kd == NULL) {
356                 /* If this is our first time through, mem_ptr will be null. */
357                 if (dinfo->mem_ptr == NULL) {
358                         /*
359                          * Get the number of devices.  If it's negative, it's an
360                          * error.  Don't bother setting the error string, since
361                          * getnumdevs() has already done that for us.
362                          */
363                         if ((dinfo->numdevs = devstat_getnumdevs(kd)) < 0)
364                                 return(-1);
365                         
366                         /*
367                          * The kern.devstat.all sysctl returns the current 
368                          * generation number, as well as all the devices.  
369                          * So we need four bytes more.
370                          */
371                         dssize = (dinfo->numdevs * sizeof(struct devstat)) +
372                                  sizeof(long);
373                         dinfo->mem_ptr = (u_int8_t *)malloc(dssize);
374                         if (dinfo->mem_ptr == NULL) {
375                                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
376                                          "%s: Cannot allocate memory for mem_ptr element",
377                                          __func__);
378                                 return(-1);
379                         }
380                 } else
381                         dssize = (dinfo->numdevs * sizeof(struct devstat)) +
382                                  sizeof(long);
383
384                 /*
385                  * Request all of the devices.  We only really allow for one
386                  * ENOMEM failure.  It would, of course, be possible to just go
387                  * in a loop and keep reallocing the device structure until we
388                  * don't get ENOMEM back.  I'm not sure it's worth it, though.
389                  * If devices are being added to the system that quickly, maybe
390                  * the user can just wait until all devices are added.
391                  */
392                 for (;;) {
393                         error = sysctlbyname("kern.devstat.all",
394                                              dinfo->mem_ptr, 
395                                              &dssize, NULL, 0);
396                         if (error != -1 || errno != EBUSY)
397                                 break;
398                 }
399                 if (error == -1) {
400                         /*
401                          * If we get ENOMEM back, that means that there are 
402                          * more devices now, so we need to allocate more 
403                          * space for the device array.
404                          */
405                         if (errno == ENOMEM) {
406                                 /*
407                                  * No need to set the error string here, 
408                                  * devstat_getnumdevs() will do that if it fails.
409                                  */
410                                 if ((dinfo->numdevs = devstat_getnumdevs(kd)) < 0)
411                                         return(-1);
412
413                                 dssize = (dinfo->numdevs * 
414                                         sizeof(struct devstat)) + sizeof(long);
415                                 dinfo->mem_ptr = (u_int8_t *)
416                                         realloc(dinfo->mem_ptr, dssize);
417                                 if ((error = sysctlbyname("kern.devstat.all", 
418                                     dinfo->mem_ptr, &dssize, NULL, 0)) == -1) {
419                                         snprintf(devstat_errbuf,
420                                                  sizeof(devstat_errbuf),
421                                                  "%s: error getting device "
422                                                  "stats\n%s: %s", __func__,
423                                                  __func__, strerror(errno));
424                                         return(-1);
425                                 }
426                         } else {
427                                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
428                                          "%s: error getting device stats\n"
429                                          "%s: %s", __func__, __func__,
430                                          strerror(errno));
431                                 return(-1);
432                         }
433                 } 
434
435         } else {
436                 /* 
437                  * This is of course non-atomic, but since we are working
438                  * on a core dump, the generation is unlikely to change
439                  */
440                 if ((dinfo->numdevs = devstat_getnumdevs(kd)) == -1)
441                         return(-1);
442                 if ((dinfo->mem_ptr = (u_int8_t *)get_devstat_kvm(kd)) == NULL)
443                         return(-1);
444         }
445         /*
446          * The sysctl spits out the generation as the first four bytes,
447          * then all of the device statistics structures.
448          */
449         dinfo->generation = *(long *)dinfo->mem_ptr;
450
451         /*
452          * If the generation has changed, and if the current number of
453          * devices is not the same as the number of devices recorded in the
454          * devinfo structure, it is likely that the device list has shrunk.
455          * The reason that it is likely that the device list has shrunk in
456          * this case is that if the device list has grown, the sysctl above
457          * will return an ENOMEM error, and we will reset the number of
458          * devices and reallocate the device array.  If the second sysctl
459          * fails, we will return an error and therefore never get to this
460          * point.  If the device list has shrunk, the sysctl will not
461          * return an error since we have more space allocated than is
462          * necessary.  So, in the shrinkage case, we catch it here and
463          * reallocate the array so that we don't use any more space than is
464          * necessary.
465          */
466         if (oldgeneration != dinfo->generation) {
467                 if (devstat_getnumdevs(kd) != dinfo->numdevs) {
468                         if ((dinfo->numdevs = devstat_getnumdevs(kd)) < 0)
469                                 return(-1);
470                         dssize = (dinfo->numdevs * sizeof(struct devstat)) +
471                                 sizeof(long);
472                         dinfo->mem_ptr = (u_int8_t *)realloc(dinfo->mem_ptr,
473                                                              dssize);
474                 }
475                 retval = 1;
476         }
477
478         dinfo->devices = (struct devstat *)(dinfo->mem_ptr + sizeof(long));
479
480         return(retval);
481 }
482
483 /*
484  * selectdevs():
485  *
486  * Devices are selected/deselected based upon the following criteria:
487  * - devices specified by the user on the command line
488  * - devices matching any device type expressions given on the command line
489  * - devices with the highest I/O, if 'top' mode is enabled
490  * - the first n unselected devices in the device list, if maxshowdevs
491  *   devices haven't already been selected and if the user has not
492  *   specified any devices on the command line and if we're in "add" mode.
493  *
494  * Input parameters:
495  * - device selection list (dev_select)
496  * - current number of devices selected (num_selected)
497  * - total number of devices in the selection list (num_selections)
498  * - devstat generation as of the last time selectdevs() was called
499  *   (select_generation)
500  * - current devstat generation (current_generation)
501  * - current list of devices and statistics (devices)
502  * - number of devices in the current device list (numdevs)
503  * - compiled version of the command line device type arguments (matches)
504  *   - This is optional.  If the number of devices is 0, this will be ignored.
505  *   - The matching code pays attention to the current selection mode.  So
506  *     if you pass in a matching expression, it will be evaluated based
507  *     upon the selection mode that is passed in.  See below for details.
508  * - number of device type matching expressions (num_matches)
509  *   - Set to 0 to disable the matching code.
510  * - list of devices specified on the command line by the user (dev_selections)
511  * - number of devices selected on the command line by the user
512  *   (num_dev_selections)
513  * - Our selection mode.  There are four different selection modes:
514  *      - add mode.  (DS_SELECT_ADD) Any devices matching devices explicitly
515  *        selected by the user or devices matching a pattern given by the
516  *        user will be selected in addition to devices that are already
517  *        selected.  Additional devices will be selected, up to maxshowdevs
518  *        number of devices. 
519  *      - only mode. (DS_SELECT_ONLY)  Only devices matching devices
520  *        explicitly given by the user or devices matching a pattern
521  *        given by the user will be selected.  No other devices will be
522  *        selected.
523  *      - addonly mode.  (DS_SELECT_ADDONLY)  This is similar to add and
524  *        only.  Basically, this will not de-select any devices that are
525  *        currently selected, as only mode would, but it will also not
526  *        gratuitously select up to maxshowdevs devices as add mode would.
527  *      - remove mode.  (DS_SELECT_REMOVE)  Any devices matching devices
528  *        explicitly selected by the user or devices matching a pattern
529  *        given by the user will be de-selected.
530  * - maximum number of devices we can select (maxshowdevs)
531  * - flag indicating whether or not we're in 'top' mode (perf_select)
532  *
533  * Output data:
534  * - the device selection list may be modified and passed back out
535  * - the number of devices selected and the total number of items in the
536  *   device selection list may be changed
537  * - the selection generation may be changed to match the current generation
538  * 
539  * Return values:
540  * -1  -- error
541  *  0  -- selected devices are unchanged
542  *  1  -- selected devices changed
543  */
544 int
545 devstat_selectdevs(struct device_selection **dev_select, int *num_selected,
546                    int *num_selections, long *select_generation, 
547                    long current_generation, struct devstat *devices,
548                    int numdevs, struct devstat_match *matches, int num_matches,
549                    char **dev_selections, int num_dev_selections,
550                    devstat_select_mode select_mode, int maxshowdevs,
551                    int perf_select)
552 {
553         int i, j, k;
554         int init_selections = 0, init_selected_var = 0;
555         struct device_selection *old_dev_select = NULL;
556         int old_num_selections = 0, old_num_selected;
557         int selection_number = 0;
558         int changed = 0, found = 0;
559
560         if ((dev_select == NULL) || (devices == NULL) || (numdevs < 0))
561                 return(-1);
562
563         /*
564          * We always want to make sure that we have as many dev_select
565          * entries as there are devices. 
566          */
567         /*
568          * In this case, we haven't selected devices before.
569          */
570         if (*dev_select == NULL) {
571                 *dev_select = (struct device_selection *)malloc(numdevs *
572                         sizeof(struct device_selection));
573                 *select_generation = current_generation;
574                 init_selections = 1;
575                 changed = 1;
576         /*
577          * In this case, we have selected devices before, but the device
578          * list has changed since we last selected devices, so we need to
579          * either enlarge or reduce the size of the device selection list.
580          */
581         } else if (*num_selections != numdevs) {
582                 *dev_select = (struct device_selection *)reallocf(*dev_select,
583                         numdevs * sizeof(struct device_selection));
584                 *select_generation = current_generation;
585                 init_selections = 1;
586         /*
587          * In this case, we've selected devices before, and the selection
588          * list is the same size as it was the last time, but the device
589          * list has changed.
590          */
591         } else if (*select_generation < current_generation) {
592                 *select_generation = current_generation;
593                 init_selections = 1;
594         }
595
596         if (*dev_select == NULL) {
597                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
598                          "%s: Cannot (re)allocate memory for dev_select argument",
599                          __func__);
600                 return(-1);
601         }
602
603         /*
604          * If we're in "only" mode, we want to clear out the selected
605          * variable since we're going to select exactly what the user wants
606          * this time through.
607          */
608         if (select_mode == DS_SELECT_ONLY)
609                 init_selected_var = 1;
610
611         /*
612          * In all cases, we want to back up the number of selected devices.
613          * It is a quick and accurate way to determine whether the selected
614          * devices have changed.
615          */
616         old_num_selected = *num_selected;
617
618         /*
619          * We want to make a backup of the current selection list if 
620          * the list of devices has changed, or if we're in performance 
621          * selection mode.  In both cases, we don't want to make a backup
622          * if we already know for sure that the list will be different.
623          * This is certainly the case if this is our first time through the
624          * selection code.
625          */
626         if (((init_selected_var != 0) || (init_selections != 0)
627          || (perf_select != 0)) && (changed == 0)){
628                 old_dev_select = (struct device_selection *)malloc(
629                     *num_selections * sizeof(struct device_selection));
630                 if (old_dev_select == NULL) {
631                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
632                                  "%s: Cannot allocate memory for selection list backup",
633                                  __func__);
634                         return(-1);
635                 }
636                 old_num_selections = *num_selections;
637                 bcopy(*dev_select, old_dev_select, 
638                     sizeof(struct device_selection) * *num_selections);
639         }
640
641         if (init_selections != 0) {
642                 bzero(*dev_select, sizeof(struct device_selection) * numdevs);
643
644                 for (i = 0; i < numdevs; i++) {
645                         (*dev_select)[i].device_number = 
646                                 devices[i].device_number;
647                         strncpy((*dev_select)[i].device_name,
648                                 devices[i].device_name,
649                                 DEVSTAT_NAME_LEN);
650                         (*dev_select)[i].device_name[DEVSTAT_NAME_LEN - 1]='\0';
651                         (*dev_select)[i].unit_number = devices[i].unit_number;
652                         (*dev_select)[i].position = i;
653                 }
654                 *num_selections = numdevs;
655         } else if (init_selected_var != 0) {
656                 for (i = 0; i < numdevs; i++) 
657                         (*dev_select)[i].selected = 0;
658         }
659
660         /* we haven't gotten around to selecting anything yet.. */
661         if ((select_mode == DS_SELECT_ONLY) || (init_selections != 0)
662          || (init_selected_var != 0))
663                 *num_selected = 0;
664
665         /*
666          * Look through any devices the user specified on the command line
667          * and see if they match known devices.  If so, select them.
668          */
669         for (i = 0; (i < *num_selections) && (num_dev_selections > 0); i++) {
670                 char tmpstr[80];
671
672                 snprintf(tmpstr, sizeof(tmpstr), "%s%d",
673                          (*dev_select)[i].device_name,
674                          (*dev_select)[i].unit_number);
675                 for (j = 0; j < num_dev_selections; j++) {
676                         if (strcmp(tmpstr, dev_selections[j]) == 0) {
677                                 /*
678                                  * Here we do different things based on the
679                                  * mode we're in.  If we're in add or
680                                  * addonly mode, we only select this device
681                                  * if it hasn't already been selected.
682                                  * Otherwise, we would be unnecessarily
683                                  * changing the selection order and
684                                  * incrementing the selection count.  If
685                                  * we're in only mode, we unconditionally
686                                  * select this device, since in only mode
687                                  * any previous selections are erased and
688                                  * manually specified devices are the first
689                                  * ones to be selected.  If we're in remove
690                                  * mode, we de-select the specified device and
691                                  * decrement the selection count.
692                                  */
693                                 switch(select_mode) {
694                                 case DS_SELECT_ADD:
695                                 case DS_SELECT_ADDONLY:
696                                         if ((*dev_select)[i].selected)
697                                                 break;
698                                         /* FALLTHROUGH */
699                                 case DS_SELECT_ONLY:
700                                         (*dev_select)[i].selected =
701                                                 ++selection_number;
702                                         (*num_selected)++;
703                                         break;
704                                 case DS_SELECT_REMOVE:
705                                         (*dev_select)[i].selected = 0;
706                                         (*num_selected)--;
707                                         /*
708                                          * This isn't passed back out, we
709                                          * just use it to keep track of
710                                          * how many devices we've removed.
711                                          */
712                                         num_dev_selections--;
713                                         break;
714                                 }
715                                 break;
716                         }
717                 }
718         }
719
720         /*
721          * Go through the user's device type expressions and select devices
722          * accordingly.  We only do this if the number of devices already
723          * selected is less than the maximum number we can show.
724          */
725         for (i = 0; (i < num_matches) && (*num_selected < maxshowdevs); i++) {
726                 /* We should probably indicate some error here */
727                 if ((matches[i].match_fields == DEVSTAT_MATCH_NONE)
728                  || (matches[i].num_match_categories <= 0))
729                         continue;
730
731                 for (j = 0; j < numdevs; j++) {
732                         int num_match_categories;
733
734                         num_match_categories = matches[i].num_match_categories;
735
736                         /*
737                          * Determine whether or not the current device
738                          * matches the given matching expression.  This if
739                          * statement consists of three components:
740                          *   - the device type check
741                          *   - the device interface check
742                          *   - the passthrough check
743                          * If a the matching test is successful, it 
744                          * decrements the number of matching categories,
745                          * and if we've reached the last element that
746                          * needed to be matched, the if statement succeeds.
747                          * 
748                          */
749                         if ((((matches[i].match_fields & DEVSTAT_MATCH_TYPE)!=0)
750                           && ((devices[j].device_type & DEVSTAT_TYPE_MASK) ==
751                                 (matches[i].device_type & DEVSTAT_TYPE_MASK))
752                           &&(((matches[i].match_fields & DEVSTAT_MATCH_PASS)!=0)
753                            || (((matches[i].match_fields & 
754                                 DEVSTAT_MATCH_PASS) == 0)
755                             && ((devices[j].device_type &
756                                 DEVSTAT_TYPE_PASS) == 0)))
757                           && (--num_match_categories == 0)) 
758                          || (((matches[i].match_fields & DEVSTAT_MATCH_IF) != 0)
759                           && ((devices[j].device_type & DEVSTAT_TYPE_IF_MASK) ==
760                                 (matches[i].device_type & DEVSTAT_TYPE_IF_MASK))
761                           &&(((matches[i].match_fields & DEVSTAT_MATCH_PASS)!=0)
762                            || (((matches[i].match_fields &
763                                 DEVSTAT_MATCH_PASS) == 0)
764                             && ((devices[j].device_type & 
765                                 DEVSTAT_TYPE_PASS) == 0)))
766                           && (--num_match_categories == 0))
767                          || (((matches[i].match_fields & DEVSTAT_MATCH_PASS)!=0)
768                           && ((devices[j].device_type & DEVSTAT_TYPE_PASS) != 0)
769                           && (--num_match_categories == 0))) {
770
771                                 /*
772                                  * This is probably a non-optimal solution
773                                  * to the problem that the devices in the
774                                  * device list will not be in the same
775                                  * order as the devices in the selection
776                                  * array.
777                                  */
778                                 for (k = 0; k < numdevs; k++) {
779                                         if ((*dev_select)[k].position == j) {
780                                                 found = 1;
781                                                 break;
782                                         }
783                                 }
784
785                                 /*
786                                  * There shouldn't be a case where a device
787                                  * in the device list is not in the
788                                  * selection list...but it could happen.
789                                  */
790                                 if (found != 1) {
791                                         fprintf(stderr, "selectdevs: couldn't"
792                                                 " find %s%d in selection "
793                                                 "list\n",
794                                                 devices[j].device_name,
795                                                 devices[j].unit_number);
796                                         break;
797                                 }
798
799                                 /*
800                                  * We do different things based upon the
801                                  * mode we're in.  If we're in add or only
802                                  * mode, we go ahead and select this device
803                                  * if it hasn't already been selected.  If
804                                  * it has already been selected, we leave
805                                  * it alone so we don't mess up the
806                                  * selection ordering.  Manually specified
807                                  * devices have already been selected, and
808                                  * they have higher priority than pattern
809                                  * matched devices.  If we're in remove
810                                  * mode, we de-select the given device and
811                                  * decrement the selected count.
812                                  */
813                                 switch(select_mode) {
814                                 case DS_SELECT_ADD:
815                                 case DS_SELECT_ADDONLY:
816                                 case DS_SELECT_ONLY:
817                                         if ((*dev_select)[k].selected != 0)
818                                                 break;
819                                         (*dev_select)[k].selected =
820                                                 ++selection_number;
821                                         (*num_selected)++;
822                                         break;
823                                 case DS_SELECT_REMOVE:
824                                         (*dev_select)[k].selected = 0;
825                                         (*num_selected)--;
826                                         break;
827                                 }
828                         }
829                 }
830         }
831
832         /*
833          * Here we implement "top" mode.  Devices are sorted in the
834          * selection array based on two criteria:  whether or not they are
835          * selected (not selection number, just the fact that they are
836          * selected!) and the number of bytes in the "bytes" field of the
837          * selection structure.  The bytes field generally must be kept up
838          * by the user.  In the future, it may be maintained by library
839          * functions, but for now the user has to do the work.
840          *
841          * At first glance, it may seem wrong that we don't go through and
842          * select every device in the case where the user hasn't specified
843          * any devices or patterns.  In fact, though, it won't make any
844          * difference in the device sorting.  In that particular case (i.e.
845          * when we're in "add" or "only" mode, and the user hasn't
846          * specified anything) the first time through no devices will be
847          * selected, so the only criterion used to sort them will be their
848          * performance.  The second time through, and every time thereafter,
849          * all devices will be selected, so again selection won't matter.
850          */
851         if (perf_select != 0) {
852
853                 /* Sort the device array by throughput  */
854                 qsort(*dev_select, *num_selections,
855                       sizeof(struct device_selection),
856                       compare_select);
857
858                 if (*num_selected == 0) {
859                         /*
860                          * Here we select every device in the array, if it
861                          * isn't already selected.  Because the 'selected'
862                          * variable in the selection array entries contains
863                          * the selection order, the devstats routine can show
864                          * the devices that were selected first.
865                          */
866                         for (i = 0; i < *num_selections; i++) {
867                                 if ((*dev_select)[i].selected == 0) {
868                                         (*dev_select)[i].selected =
869                                                 ++selection_number;
870                                         (*num_selected)++;
871                                 }
872                         }
873                 } else {
874                         selection_number = 0;
875                         for (i = 0; i < *num_selections; i++) {
876                                 if ((*dev_select)[i].selected != 0) {
877                                         (*dev_select)[i].selected =
878                                                 ++selection_number;
879                                 }
880                         }
881                 }
882         }
883
884         /*
885          * If we're in the "add" selection mode and if we haven't already
886          * selected maxshowdevs number of devices, go through the array and
887          * select any unselected devices.  If we're in "only" mode, we
888          * obviously don't want to select anything other than what the user
889          * specifies.  If we're in "remove" mode, it probably isn't a good
890          * idea to go through and select any more devices, since we might
891          * end up selecting something that the user wants removed.  Through
892          * more complicated logic, we could actually figure this out, but
893          * that would probably require combining this loop with the various
894          * selections loops above.
895          */
896         if ((select_mode == DS_SELECT_ADD) && (*num_selected < maxshowdevs)) {
897                 for (i = 0; i < *num_selections; i++)
898                         if ((*dev_select)[i].selected == 0) {
899                                 (*dev_select)[i].selected = ++selection_number;
900                                 (*num_selected)++;
901                         }
902         }
903
904         /*
905          * Look at the number of devices that have been selected.  If it
906          * has changed, set the changed variable.  Otherwise, if we've
907          * made a backup of the selection list, compare it to the current
908          * selection list to see if the selected devices have changed.
909          */
910         if ((changed == 0) && (old_num_selected != *num_selected))
911                 changed = 1;
912         else if ((changed == 0) && (old_dev_select != NULL)) {
913                 /*
914                  * Now we go through the selection list and we look at
915                  * it three different ways.
916                  */
917                 for (i = 0; (i < *num_selections) && (changed == 0) && 
918                      (i < old_num_selections); i++) {
919                         /*
920                          * If the device at index i in both the new and old
921                          * selection arrays has the same device number and
922                          * selection status, it hasn't changed.  We
923                          * continue on to the next index.
924                          */
925                         if (((*dev_select)[i].device_number ==
926                              old_dev_select[i].device_number)
927                          && ((*dev_select)[i].selected == 
928                              old_dev_select[i].selected))
929                                 continue;
930
931                         /*
932                          * Now, if we're still going through the if
933                          * statement, the above test wasn't true.  So we
934                          * check here to see if the device at index i in
935                          * the current array is the same as the device at
936                          * index i in the old array.  If it is, that means
937                          * that its selection number has changed.  Set
938                          * changed to 1 and exit the loop.
939                          */
940                         else if ((*dev_select)[i].device_number ==
941                                   old_dev_select[i].device_number) {
942                                 changed = 1;
943                                 break;
944                         }
945                         /*
946                          * If we get here, then the device at index i in
947                          * the current array isn't the same device as the
948                          * device at index i in the old array.
949                          */
950                         else {
951                                 found = 0;
952
953                                 /*
954                                  * Search through the old selection array
955                                  * looking for a device with the same
956                                  * device number as the device at index i
957                                  * in the current array.  If the selection
958                                  * status is the same, then we mark it as
959                                  * found.  If the selection status isn't
960                                  * the same, we break out of the loop.
961                                  * Since found isn't set, changed will be
962                                  * set to 1 below.
963                                  */
964                                 for (j = 0; j < old_num_selections; j++) {
965                                         if (((*dev_select)[i].device_number ==
966                                               old_dev_select[j].device_number)
967                                          && ((*dev_select)[i].selected ==
968                                               old_dev_select[j].selected)){
969                                                 found = 1;
970                                                 break;
971                                         }
972                                         else if ((*dev_select)[i].device_number
973                                             == old_dev_select[j].device_number)
974                                                 break;
975                                 }
976                                 if (found == 0)
977                                         changed = 1;
978                         }
979                 }
980         }
981         if (old_dev_select != NULL)
982                 free(old_dev_select);
983
984         return(changed);
985 }
986
987 /*
988  * Comparison routine for qsort() above.  Note that the comparison here is
989  * backwards -- generally, it should return a value to indicate whether
990  * arg1 is <, =, or > arg2.  Instead, it returns the opposite.  The reason
991  * it returns the opposite is so that the selection array will be sorted in
992  * order of decreasing performance.  We sort on two parameters.  The first
993  * sort key is whether or not one or the other of the devices in question
994  * has been selected.  If one of them has, and the other one has not, the
995  * selected device is automatically more important than the unselected
996  * device.  If neither device is selected, we judge the devices based upon
997  * performance.
998  */
999 static int
1000 compare_select(const void *arg1, const void *arg2)
1001 {
1002         if ((((const struct device_selection *)arg1)->selected)
1003          && (((const struct device_selection *)arg2)->selected == 0))
1004                 return(-1);
1005         else if ((((const struct device_selection *)arg1)->selected == 0)
1006               && (((const struct device_selection *)arg2)->selected))
1007                 return(1);
1008         else if (((const struct device_selection *)arg2)->bytes <
1009                  ((const struct device_selection *)arg1)->bytes)
1010                 return(-1);
1011         else if (((const struct device_selection *)arg2)->bytes >
1012                  ((const struct device_selection *)arg1)->bytes)
1013                 return(1);
1014         else
1015                 return(0);
1016 }
1017
1018 /*
1019  * Take a string with the general format "arg1,arg2,arg3", and build a
1020  * device matching expression from it.
1021  */
1022 int
1023 devstat_buildmatch(char *match_str, struct devstat_match **matches,
1024                    int *num_matches)
1025 {
1026         char *tstr[5];
1027         char **tempstr;
1028         int num_args;
1029         int i, j;
1030
1031         /* We can't do much without a string to parse */
1032         if (match_str == NULL) {
1033                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1034                          "%s: no match expression", __func__);
1035                 return(-1);
1036         }
1037
1038         /*
1039          * Break the (comma delimited) input string out into separate strings.
1040          */
1041         for (tempstr = tstr, num_args  = 0; 
1042              (*tempstr = strsep(&match_str, ",")) != NULL && (num_args < 5);)
1043                 if (**tempstr != '\0') {
1044                         num_args++;
1045                         if (++tempstr >= &tstr[5])
1046                                 break;
1047                 }
1048
1049         /* The user gave us too many type arguments */
1050         if (num_args > 3) {
1051                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1052                          "%s: too many type arguments", __func__);
1053                 return(-1);
1054         }
1055
1056         if (*num_matches == 0)
1057                 *matches = NULL;
1058
1059         *matches = (struct devstat_match *)reallocf(*matches,
1060                   sizeof(struct devstat_match) * (*num_matches + 1));
1061
1062         if (*matches == NULL) {
1063                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1064                          "%s: Cannot allocate memory for matches list", __func__);
1065                 return(-1);
1066         }
1067                           
1068         /* Make sure the current entry is clear */
1069         bzero(&matches[0][*num_matches], sizeof(struct devstat_match));
1070
1071         /*
1072          * Step through the arguments the user gave us and build a device
1073          * matching expression from them.
1074          */
1075         for (i = 0; i < num_args; i++) {
1076                 char *tempstr2, *tempstr3;
1077
1078                 /*
1079                  * Get rid of leading white space.
1080                  */
1081                 tempstr2 = tstr[i];
1082                 while (isspace(*tempstr2) && (*tempstr2 != '\0'))
1083                         tempstr2++;
1084
1085                 /*
1086                  * Get rid of trailing white space.
1087                  */
1088                 tempstr3 = &tempstr2[strlen(tempstr2) - 1];
1089
1090                 while ((*tempstr3 != '\0') && (tempstr3 > tempstr2)
1091                     && (isspace(*tempstr3))) {
1092                         *tempstr3 = '\0';
1093                         tempstr3--;
1094                 }
1095
1096                 /*
1097                  * Go through the match table comparing the user's
1098                  * arguments to known device types, interfaces, etc.  
1099                  */
1100                 for (j = 0; match_table[j].match_str != NULL; j++) {
1101                         /*
1102                          * We do case-insensitive matching, in case someone
1103                          * wants to enter "SCSI" instead of "scsi" or
1104                          * something like that.  Only compare as many 
1105                          * characters as are in the string in the match 
1106                          * table.  This should help if someone tries to use 
1107                          * a super-long match expression.  
1108                          */
1109                         if (strncasecmp(tempstr2, match_table[j].match_str,
1110                             strlen(match_table[j].match_str)) == 0) {
1111                                 /*
1112                                  * Make sure the user hasn't specified two
1113                                  * items of the same type, like "da" and
1114                                  * "cd".  One device cannot be both.
1115                                  */
1116                                 if (((*matches)[*num_matches].match_fields &
1117                                     match_table[j].match_field) != 0) {
1118                                         snprintf(devstat_errbuf,
1119                                                  sizeof(devstat_errbuf),
1120                                                  "%s: cannot have more than "
1121                                                  "one match item in a single "
1122                                                  "category", __func__);
1123                                         return(-1);
1124                                 }
1125                                 /*
1126                                  * If we've gotten this far, we have a
1127                                  * winner.  Set the appropriate fields in
1128                                  * the match entry.
1129                                  */
1130                                 (*matches)[*num_matches].match_fields |=
1131                                         match_table[j].match_field;
1132                                 (*matches)[*num_matches].device_type |=
1133                                         match_table[j].type;
1134                                 (*matches)[*num_matches].num_match_categories++;
1135                                 break;
1136                         }
1137                 }
1138                 /*
1139                  * We should have found a match in the above for loop.  If
1140                  * not, that means the user entered an invalid device type
1141                  * or interface.
1142                  */
1143                 if ((*matches)[*num_matches].num_match_categories != (i + 1)) {
1144                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1145                                  "%s: unknown match item \"%s\"", __func__,
1146                                  tstr[i]);
1147                         return(-1);
1148                 }
1149         }
1150
1151         (*num_matches)++;
1152
1153         return(0);
1154 }
1155
1156 /*
1157  * Compute a number of device statistics.  Only one field is mandatory, and
1158  * that is "current".  Everything else is optional.  The caller passes in
1159  * pointers to variables to hold the various statistics he desires.  If he
1160  * doesn't want a particular staistic, he should pass in a NULL pointer.
1161  * Return values:
1162  * 0   -- success
1163  * -1  -- failure
1164  */
1165 int
1166 compute_stats(struct devstat *current, struct devstat *previous,
1167               long double etime, u_int64_t *total_bytes,
1168               u_int64_t *total_transfers, u_int64_t *total_blocks,
1169               long double *kb_per_transfer, long double *transfers_per_second,
1170               long double *mb_per_second, long double *blocks_per_second,
1171               long double *ms_per_transaction)
1172 {
1173         return(devstat_compute_statistics(current, previous, etime,
1174                total_bytes ? DSM_TOTAL_BYTES : DSM_SKIP,
1175                total_bytes,
1176                total_transfers ? DSM_TOTAL_TRANSFERS : DSM_SKIP,
1177                total_transfers,
1178                total_blocks ? DSM_TOTAL_BLOCKS : DSM_SKIP,
1179                total_blocks,
1180                kb_per_transfer ? DSM_KB_PER_TRANSFER : DSM_SKIP,
1181                kb_per_transfer,
1182                transfers_per_second ? DSM_TRANSFERS_PER_SECOND : DSM_SKIP,
1183                transfers_per_second,
1184                mb_per_second ? DSM_MB_PER_SECOND : DSM_SKIP,
1185                mb_per_second,
1186                blocks_per_second ? DSM_BLOCKS_PER_SECOND : DSM_SKIP,
1187                blocks_per_second,
1188                ms_per_transaction ? DSM_MS_PER_TRANSACTION : DSM_SKIP,
1189                ms_per_transaction,
1190                DSM_NONE));
1191 }
1192
1193
1194 /* This is 1/2^64 */
1195 #define BINTIME_SCALE 5.42101086242752217003726400434970855712890625e-20
1196
1197 long double
1198 devstat_compute_etime(struct bintime *cur_time, struct bintime *prev_time)
1199 {
1200         long double etime;
1201
1202         etime = cur_time->sec;
1203         etime += cur_time->frac * BINTIME_SCALE;
1204         if (prev_time != NULL) {
1205                 etime -= prev_time->sec;
1206                 etime -= prev_time->frac * BINTIME_SCALE;
1207         }
1208         return(etime);
1209 }
1210
/*
 * Difference between element 'index' of array member 'field' in the
 * current and previous structures.  If 'previous' is NULL, the current
 * value is used unchanged.  Expects pointers named 'current' and
 * 'previous' to be in scope where the macro is expanded.
 */
#define DELTA(field, index)                                             \
        (current->field[(index)] -                                      \
            ((previous != NULL) ? previous->field[(index)] : 0))
1213
/*
 * Pass member 'field' of the current structure, and of the previous
 * structure when there is one (NULL otherwise), to
 * devstat_compute_etime().  Expects pointers named 'current' and
 * 'previous' to be in scope where the macro is expanded.
 */
#define DELTA_T(field)                                                  \
        devstat_compute_etime(&current->field,                          \
            ((previous != NULL) ? &previous->field : NULL))
1217
1218 int
1219 devstat_compute_statistics(struct devstat *current, struct devstat *previous,
1220                            long double etime, ...)
1221 {
1222         u_int64_t totalbytes, totalbytesread, totalbyteswrite, totalbytesfree;
1223         u_int64_t totaltransfers, totaltransfersread, totaltransferswrite;
1224         u_int64_t totaltransfersother, totalblocks, totalblocksread;
1225         u_int64_t totalblockswrite, totaltransfersfree, totalblocksfree;
1226         long double totalduration, totaldurationread, totaldurationwrite;
1227         long double totaldurationfree, totaldurationother;
1228         va_list ap;
1229         devstat_metric metric;
1230         u_int64_t *destu64;
1231         long double *destld;
1232         int retval;
1233
1234         retval = 0;
1235
1236         /*
1237          * current is the only mandatory field.
1238          */
1239         if (current == NULL) {
1240                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1241                          "%s: current stats structure was NULL", __func__);
1242                 return(-1);
1243         }
1244
1245         totalbytesread = DELTA(bytes, DEVSTAT_READ);
1246         totalbyteswrite = DELTA(bytes, DEVSTAT_WRITE);
1247         totalbytesfree = DELTA(bytes, DEVSTAT_FREE);
1248         totalbytes = totalbytesread + totalbyteswrite + totalbytesfree;
1249
1250         totaltransfersread = DELTA(operations, DEVSTAT_READ);
1251         totaltransferswrite = DELTA(operations, DEVSTAT_WRITE);
1252         totaltransfersother = DELTA(operations, DEVSTAT_NO_DATA);
1253         totaltransfersfree = DELTA(operations, DEVSTAT_FREE);
1254         totaltransfers = totaltransfersread + totaltransferswrite +
1255                          totaltransfersother + totaltransfersfree;
1256
1257         totalblocks = totalbytes;
1258         totalblocksread = totalbytesread;
1259         totalblockswrite = totalbyteswrite;
1260         totalblocksfree = totalbytesfree;
1261
1262         if (current->block_size > 0) {
1263                 totalblocks /= current->block_size;
1264                 totalblocksread /= current->block_size;
1265                 totalblockswrite /= current->block_size;
1266                 totalblocksfree /= current->block_size;
1267         } else {
1268                 totalblocks /= 512;
1269                 totalblocksread /= 512;
1270                 totalblockswrite /= 512;
1271                 totalblocksfree /= 512;
1272         }
1273
1274         totaldurationread = DELTA_T(duration[DEVSTAT_READ]);
1275         totaldurationwrite = DELTA_T(duration[DEVSTAT_WRITE]);
1276         totaldurationfree = DELTA_T(duration[DEVSTAT_FREE]);
1277         totaldurationother = DELTA_T(duration[DEVSTAT_NO_DATA]);
1278         totalduration = totaldurationread + totaldurationwrite +
1279             totaldurationfree + totaldurationother;
1280
1281         va_start(ap, etime);
1282
1283         while ((metric = (devstat_metric)va_arg(ap, devstat_metric)) != 0) {
1284
1285                 if (metric == DSM_NONE)
1286                         break;
1287
1288                 if (metric >= DSM_MAX) {
1289                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1290                                  "%s: metric %d is out of range", __func__,
1291                                  metric);
1292                         retval = -1;
1293                         goto bailout;
1294                 }
1295
1296                 switch (devstat_arg_list[metric].argtype) {
1297                 case DEVSTAT_ARG_UINT64:
1298                         destu64 = (u_int64_t *)va_arg(ap, u_int64_t *);
1299                         break;
1300                 case DEVSTAT_ARG_LD:
1301                         destld = (long double *)va_arg(ap, long double *);
1302                         break;
1303                 case DEVSTAT_ARG_SKIP:
1304                         destld = (long double *)va_arg(ap, long double *);
1305                         break;
1306                 default:
1307                         retval = -1;
1308                         goto bailout;
1309                         break; /* NOTREACHED */
1310                 }
1311
1312                 if (devstat_arg_list[metric].argtype == DEVSTAT_ARG_SKIP)
1313                         continue;
1314
1315                 switch (metric) {
1316                 case DSM_TOTAL_BYTES:
1317                         *destu64 = totalbytes;
1318                         break;
1319                 case DSM_TOTAL_BYTES_READ:
1320                         *destu64 = totalbytesread;
1321                         break;
1322                 case DSM_TOTAL_BYTES_WRITE:
1323                         *destu64 = totalbyteswrite;
1324                         break;
1325                 case DSM_TOTAL_BYTES_FREE:
1326                         *destu64 = totalbytesfree;
1327                         break;
1328                 case DSM_TOTAL_TRANSFERS:
1329                         *destu64 = totaltransfers;
1330                         break;
1331                 case DSM_TOTAL_TRANSFERS_READ:
1332                         *destu64 = totaltransfersread;
1333                         break;
1334                 case DSM_TOTAL_TRANSFERS_WRITE:
1335                         *destu64 = totaltransferswrite;
1336                         break;
1337                 case DSM_TOTAL_TRANSFERS_FREE:
1338                         *destu64 = totaltransfersfree;
1339                         break;
1340                 case DSM_TOTAL_TRANSFERS_OTHER:
1341                         *destu64 = totaltransfersother;
1342                         break;
1343                 case DSM_TOTAL_BLOCKS:
1344                         *destu64 = totalblocks;
1345                         break;
1346                 case DSM_TOTAL_BLOCKS_READ:
1347                         *destu64 = totalblocksread;
1348                         break;
1349                 case DSM_TOTAL_BLOCKS_WRITE:
1350                         *destu64 = totalblockswrite;
1351                         break;
1352                 case DSM_TOTAL_BLOCKS_FREE:
1353                         *destu64 = totalblocksfree;
1354                         break;
1355                 case DSM_KB_PER_TRANSFER:
1356                         *destld = totalbytes;
1357                         *destld /= 1024;
1358                         if (totaltransfers > 0)
1359                                 *destld /= totaltransfers;
1360                         else
1361                                 *destld = 0.0;
1362                         break;
1363                 case DSM_KB_PER_TRANSFER_READ:
1364                         *destld = totalbytesread;
1365                         *destld /= 1024;
1366                         if (totaltransfersread > 0)
1367                                 *destld /= totaltransfersread;
1368                         else
1369                                 *destld = 0.0;
1370                         break;
1371                 case DSM_KB_PER_TRANSFER_WRITE:
1372                         *destld = totalbyteswrite;
1373                         *destld /= 1024;
1374                         if (totaltransferswrite > 0)
1375                                 *destld /= totaltransferswrite;
1376                         else
1377                                 *destld = 0.0;
1378                         break;
1379                 case DSM_KB_PER_TRANSFER_FREE:
1380                         *destld = totalbytesfree;
1381                         *destld /= 1024;
1382                         if (totaltransfersfree > 0)
1383                                 *destld /= totaltransfersfree;
1384                         else
1385                                 *destld = 0.0;
1386                         break;
1387                 case DSM_TRANSFERS_PER_SECOND:
1388                         if (etime > 0.0) {
1389                                 *destld = totaltransfers;
1390                                 *destld /= etime;
1391                         } else
1392                                 *destld = 0.0;
1393                         break;
1394                 case DSM_TRANSFERS_PER_SECOND_READ:
1395                         if (etime > 0.0) {
1396                                 *destld = totaltransfersread;
1397                                 *destld /= etime;
1398                         } else
1399                                 *destld = 0.0;
1400                         break;
1401                 case DSM_TRANSFERS_PER_SECOND_WRITE:
1402                         if (etime > 0.0) {
1403                                 *destld = totaltransferswrite;
1404                                 *destld /= etime;
1405                         } else
1406                                 *destld = 0.0;
1407                         break;
1408                 case DSM_TRANSFERS_PER_SECOND_FREE:
1409                         if (etime > 0.0) {
1410                                 *destld = totaltransfersfree;
1411                                 *destld /= etime;
1412                         } else
1413                                 *destld = 0.0;
1414                         break;
1415                 case DSM_TRANSFERS_PER_SECOND_OTHER:
1416                         if (etime > 0.0) {
1417                                 *destld = totaltransfersother;
1418                                 *destld /= etime;
1419                         } else
1420                                 *destld = 0.0;
1421                         break;
1422                 case DSM_MB_PER_SECOND:
1423                         *destld = totalbytes;
1424                         *destld /= 1024 * 1024;
1425                         if (etime > 0.0)
1426                                 *destld /= etime;
1427                         else
1428                                 *destld = 0.0;
1429                         break;
1430                 case DSM_MB_PER_SECOND_READ:
1431                         *destld = totalbytesread;
1432                         *destld /= 1024 * 1024;
1433                         if (etime > 0.0)
1434                                 *destld /= etime;
1435                         else
1436                                 *destld = 0.0;
1437                         break;
1438                 case DSM_MB_PER_SECOND_WRITE:
1439                         *destld = totalbyteswrite;
1440                         *destld /= 1024 * 1024;
1441                         if (etime > 0.0)
1442                                 *destld /= etime;
1443                         else
1444                                 *destld = 0.0;
1445                         break;
1446                 case DSM_MB_PER_SECOND_FREE:
1447                         *destld = totalbytesfree;
1448                         *destld /= 1024 * 1024;
1449                         if (etime > 0.0)
1450                                 *destld /= etime;
1451                         else
1452                                 *destld = 0.0;
1453                         break;
1454                 case DSM_BLOCKS_PER_SECOND:
1455                         *destld = totalblocks;
1456                         if (etime > 0.0)
1457                                 *destld /= etime;
1458                         else
1459                                 *destld = 0.0;
1460                         break;
1461                 case DSM_BLOCKS_PER_SECOND_READ:
1462                         *destld = totalblocksread;
1463                         if (etime > 0.0)
1464                                 *destld /= etime;
1465                         else
1466                                 *destld = 0.0;
1467                         break;
1468                 case DSM_BLOCKS_PER_SECOND_WRITE:
1469                         *destld = totalblockswrite;
1470                         if (etime > 0.0)
1471                                 *destld /= etime;
1472                         else
1473                                 *destld = 0.0;
1474                         break;
1475                 case DSM_BLOCKS_PER_SECOND_FREE:
1476                         *destld = totalblocksfree;
1477                         if (etime > 0.0)
1478                                 *destld /= etime;
1479                         else
1480                                 *destld = 0.0;
1481                         break;
1482                 /*
1483                  * This calculation is somewhat bogus.  It simply divides
1484                  * the elapsed time by the total number of transactions
1485                  * completed.  While that does give the caller a good
1486                  * picture of the average rate of transaction completion,
1487                  * it doesn't necessarily give the caller a good view of
1488                  * how long transactions took to complete on average.
1489                  * Those two numbers will be different for a device that
1490                  * can handle more than one transaction at a time.  e.g.
1491                  * SCSI disks doing tagged queueing.
1492                  *
1493                  * The only way to accurately determine the real average
1494                  * time per transaction would be to compute and store the
1495                  * time on a per-transaction basis.  That currently isn't
1496                  * done in the kernel, and would only be desirable if it
1497                  * could be implemented in a somewhat non-intrusive and high
1498                  * performance way.
1499                  */
1500                 case DSM_MS_PER_TRANSACTION:
1501                         if (totaltransfers > 0) {
1502                                 *destld = totalduration;
1503                                 *destld /= totaltransfers;
1504                                 *destld *= 1000;
1505                         } else
1506                                 *destld = 0.0;
1507                         break;
1508                 /*
1509                  * As above, these next two really only give the average
1510                  * rate of completion for read and write transactions, not
1511                  * the average time the transaction took to complete.
1512                  */
1513                 case DSM_MS_PER_TRANSACTION_READ:
1514                         if (totaltransfersread > 0) {
1515                                 *destld = totaldurationread;
1516                                 *destld /= totaltransfersread;
1517                                 *destld *= 1000;
1518                         } else
1519                                 *destld = 0.0;
1520                         break;
1521                 case DSM_MS_PER_TRANSACTION_WRITE:
1522                         if (totaltransferswrite > 0) {
1523                                 *destld = totaldurationwrite;
1524                                 *destld /= totaltransferswrite;
1525                                 *destld *= 1000;
1526                         } else
1527                                 *destld = 0.0;
1528                         break;
1529                 case DSM_MS_PER_TRANSACTION_FREE:
1530                         if (totaltransfersfree > 0) {
1531                                 *destld = totaldurationfree;
1532                                 *destld /= totaltransfersfree;
1533                                 *destld *= 1000;
1534                         } else
1535                                 *destld = 0.0;
1536                         break;
1537                 case DSM_MS_PER_TRANSACTION_OTHER:
1538                         if (totaltransfersother > 0) {
1539                                 *destld = totaldurationother;
1540                                 *destld /= totaltransfersother;
1541                                 *destld *= 1000;
1542                         } else
1543                                 *destld = 0.0;
1544                         break;
1545                 case DSM_BUSY_PCT:
1546                         *destld = DELTA_T(busy_time);
1547                         if (*destld < 0)
1548                                 *destld = 0;
1549                         *destld /= etime;
1550                         *destld *= 100;
1551                         if (*destld < 0)
1552                                 *destld = 0;
1553                         break;
1554                 case DSM_QUEUE_LENGTH:
1555                         *destu64 = current->start_count - current->end_count;
1556                         break;
1557                 case DSM_TOTAL_DURATION:
1558                         *destld = totalduration;
1559                         break;
1560                 case DSM_TOTAL_DURATION_READ:
1561                         *destld = totaldurationread;
1562                         break;
1563                 case DSM_TOTAL_DURATION_WRITE:
1564                         *destld = totaldurationwrite;
1565                         break;
1566                 case DSM_TOTAL_DURATION_FREE:
1567                         *destld = totaldurationfree;
1568                         break;
1569                 case DSM_TOTAL_DURATION_OTHER:
1570                         *destld = totaldurationother;
1571                         break;
1572                 case DSM_TOTAL_BUSY_TIME:
1573                         *destld = DELTA_T(busy_time);
1574                         break;
1575 /*
1576  * XXX: comment out the default block to see if any case's are missing.
1577  */
1578 #if 1
1579                 default:
1580                         /*
1581                          * This shouldn't happen, since we should have
1582                          * caught any out of range metrics at the top of
1583                          * the loop.
1584                          */
1585                         snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1586                                  "%s: unknown metric %d", __func__, metric);
1587                         retval = -1;
1588                         goto bailout;
1589                         break; /* NOTREACHED */
1590 #endif
1591                 }
1592         }
1593
1594 bailout:
1595
1596         va_end(ap);
1597         return(retval);
1598 }
1599
1600 static int 
1601 readkmem(kvm_t *kd, unsigned long addr, void *buf, size_t nbytes)
1602 {
1603
1604         if (kvm_read(kd, addr, buf, nbytes) == -1) {
1605                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1606                          "%s: error reading value (kvm_read): %s", __func__,
1607                          kvm_geterr(kd));
1608                 return(-1);
1609         }
1610         return(0);
1611 }
1612
1613 static int
1614 readkmem_nl(kvm_t *kd, const char *name, void *buf, size_t nbytes)
1615 {
1616         struct nlist nl[2];
1617
1618         nl[0].n_name = (char *)name;
1619         nl[1].n_name = NULL;
1620
1621         if (kvm_nlist(kd, nl) == -1) {
1622                 snprintf(devstat_errbuf, sizeof(devstat_errbuf),
1623                          "%s: error getting name list (kvm_nlist): %s",
1624                          __func__, kvm_geterr(kd));
1625                 return(-1);
1626         }
1627         return(readkmem(kd, nl[0].n_value, buf, nbytes));
1628 }
1629
1630 /*
1631  * This duplicates the functionality of the kernel sysctl handler for poking
1632  * through crash dumps.
1633  */
1634 static char *
1635 get_devstat_kvm(kvm_t *kd)
1636 {
1637         int i, wp;
1638         long gen;
1639         struct devstat *nds;
1640         struct devstat ds;
1641         struct devstatlist dhead;
1642         int num_devs;
1643         char *rv = NULL;
1644
1645         if ((num_devs = devstat_getnumdevs(kd)) <= 0)
1646                 return(NULL);
1647         if (KREADNL(kd, X_DEVICE_STATQ, dhead) == -1)
1648                 return(NULL);
1649
1650         nds = STAILQ_FIRST(&dhead);
1651         
1652         if ((rv = malloc(sizeof(gen))) == NULL) {
1653                 snprintf(devstat_errbuf, sizeof(devstat_errbuf), 
1654                          "%s: out of memory (initial malloc failed)",
1655                          __func__);
1656                 return(NULL);
1657         }
1658         gen = devstat_getgeneration(kd);
1659         memcpy(rv, &gen, sizeof(gen));
1660         wp = sizeof(gen);
1661         /*
1662          * Now push out all the devices.
1663          */
1664         for (i = 0; (nds != NULL) && (i < num_devs);  
1665              nds = STAILQ_NEXT(nds, dev_links), i++) {
1666                 if (readkmem(kd, (long)nds, &ds, sizeof(ds)) == -1) {
1667                         free(rv);
1668                         return(NULL);
1669                 }
1670                 nds = &ds;
1671                 rv = (char *)reallocf(rv, sizeof(gen) + 
1672                                       sizeof(ds) * (i + 1));
1673                 if (rv == NULL) {
1674                         snprintf(devstat_errbuf, sizeof(devstat_errbuf), 
1675                                  "%s: out of memory (malloc failed)",
1676                                  __func__);
1677                         return(NULL);
1678                 }
1679                 memcpy(rv + wp, &ds, sizeof(ds));
1680                 wp += sizeof(ds);
1681         }
1682         return(rv);
1683 }