cddl/contrib/opensolaris/lib/libdtrace/common/dt_aggregate.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  */
  26
  27 /*
  28  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  29  * Copyright (c) 2012 by Delphix. All rights reserved.
  30  */
  31
  32 #include <stdlib.h>
  33 #include <strings.h>
  34 #include <errno.h>
  35 #include <unistd.h>
  36 #include <dt_impl.h>
  37 #include <assert.h>
  38 #include <dt_oformat.h>
  39 #ifdef illumos
  40 #include <alloca.h>
  41 #else
  42 #include <sys/sysctl.h>
  43 #include <libproc_compat.h>
  44 #endif
  45 #include <limits.h>
  46
  47 #define DTRACE_AHASHSIZE        32779           /* big 'ol prime */
  48
  49 /*
  50  * Because qsort(3C) does not allow an argument to be passed to a comparison
  51  * function, the variables that affect comparison must regrettably be global;
  52  * they are protected by a global static lock, dt_qsort_lock.
  53  */
  54 static pthread_mutex_t dt_qsort_lock = PTHREAD_MUTEX_INITIALIZER;
  55
  56 static int dt_revsort;
  57 static int dt_keysort;
  58 static int dt_keypos;
  59
  60 #define DT_LESSTHAN     (dt_revsort == 0 ? -1 : 1)
  61 #define DT_GREATERTHAN  (dt_revsort == 0 ? 1 : -1)
  62
  63 static void
  64 dt_aggregate_count(int64_t *existing, int64_t *new, size_t size)
  65 {
  66         uint_t i;
  67
  68         for (i = 0; i < size / sizeof (int64_t); i++)
  69                 existing[i] = existing[i] + new[i];
  70 }
  71
  72 static int
  73 dt_aggregate_countcmp(int64_t *lhs, int64_t *rhs)
  74 {
  75         int64_t lvar = *lhs;
  76         int64_t rvar = *rhs;
  77
  78         if (lvar < rvar)
  79                 return (DT_LESSTHAN);
  80
  81         if (lvar > rvar)
  82                 return (DT_GREATERTHAN);
  83
  84         return (0);
  85 }
  86
  87 /*ARGSUSED*/
  88 static void
  89 dt_aggregate_min(int64_t *existing, int64_t *new, size_t size)
  90 {
  91         if (*new < *existing)
  92                 *existing = *new;
  93 }
  94
  95 /*ARGSUSED*/
  96 static void
  97 dt_aggregate_max(int64_t *existing, int64_t *new, size_t size)
  98 {
  99         if (*new > *existing)
 100                 *existing = *new;
 101 }
 102
 103 static int
 104 dt_aggregate_averagecmp(int64_t *lhs, int64_t *rhs)
 105 {
 106         int64_t lavg = lhs[0] ? (lhs[1] / lhs[0]) : 0;
 107         int64_t ravg = rhs[0] ? (rhs[1] / rhs[0]) : 0;
 108
 109         if (lavg < ravg)
 110                 return (DT_LESSTHAN);
 111
 112         if (lavg > ravg)
 113                 return (DT_GREATERTHAN);
 114
 115         return (0);
 116 }
 117
 118 static int
 119 dt_aggregate_stddevcmp(int64_t *lhs, int64_t *rhs)
 120 {
 121         uint64_t lsd = dt_stddev((uint64_t *)lhs, 1);
 122         uint64_t rsd = dt_stddev((uint64_t *)rhs, 1);
 123
 124         if (lsd < rsd)
 125                 return (DT_LESSTHAN);
 126
 127         if (lsd > rsd)
 128                 return (DT_GREATERTHAN);
 129
 130         return (0);
 131 }
 132
 133 /*ARGSUSED*/
 134 static void
 135 dt_aggregate_lquantize(int64_t *existing, int64_t *new, size_t size)
 136 {
 137         int64_t arg = *existing++;
 138         uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
 139         int i;
 140
 141         for (i = 0; i <= levels + 1; i++)
 142                 existing[i] = existing[i] + new[i + 1];
 143 }
 144
 145 static long double
 146 dt_aggregate_lquantizedsum(int64_t *lquanta)
 147 {
 148         int64_t arg = *lquanta++;
 149         int32_t base = DTRACE_LQUANTIZE_BASE(arg);
 150         uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
 151         uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
 152         long double total = (long double)lquanta[0] * (long double)(base - 1);
 153
 154         for (i = 0; i < levels; base += step, i++)
 155                 total += (long double)lquanta[i + 1] * (long double)base;
 156
 157         return (total + (long double)lquanta[levels + 1] *
 158             (long double)(base + 1));
 159 }
 160
 161 static int64_t
 162 dt_aggregate_lquantizedzero(int64_t *lquanta)
 163 {
 164         int64_t arg = *lquanta++;
 165         int32_t base = DTRACE_LQUANTIZE_BASE(arg);
 166         uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
 167         uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg), i;
 168
 169         if (base - 1 == 0)
 170                 return (lquanta[0]);
 171
 172         for (i = 0; i < levels; base += step, i++) {
 173                 if (base != 0)
 174                         continue;
 175
 176                 return (lquanta[i + 1]);
 177         }
 178
 179         if (base + 1 == 0)
 180                 return (lquanta[levels + 1]);
 181
 182         return (0);
 183 }
 184
 185 static int
 186 dt_aggregate_lquantizedcmp(int64_t *lhs, int64_t *rhs)
 187 {
 188         long double lsum = dt_aggregate_lquantizedsum(lhs);
 189         long double rsum = dt_aggregate_lquantizedsum(rhs);
 190         int64_t lzero, rzero;
 191
 192         if (lsum < rsum)
 193                 return (DT_LESSTHAN);
 194
 195         if (lsum > rsum)
 196                 return (DT_GREATERTHAN);
 197
 198         /*
 199          * If they're both equal, then we will compare based on the weights at
 200          * zero.  If the weights at zero are equal (or if zero is not within
 201          * the range of the linear quantization), then this will be judged a
 202          * tie and will be resolved based on the key comparison.
 203          */
 204         lzero = dt_aggregate_lquantizedzero(lhs);
 205         rzero = dt_aggregate_lquantizedzero(rhs);
 206
 207         if (lzero < rzero)
 208                 return (DT_LESSTHAN);
 209
 210         if (lzero > rzero)
 211                 return (DT_GREATERTHAN);
 212
 213         return (0);
 214 }
 215
 216 static void
 217 dt_aggregate_llquantize(int64_t *existing, int64_t *new, size_t size)
 218 {
 219         int i;
 220
 221         for (i = 1; i < size / sizeof (int64_t); i++)
 222                 existing[i] = existing[i] + new[i];
 223 }
 224
 225 static long double
 226 dt_aggregate_llquantizedsum(int64_t *llquanta)
 227 {
 228         int64_t arg = *llquanta++;
 229         uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
 230         uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
 231         uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
 232         uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
 233         int bin = 0, order;
 234         int64_t value = 1, next, step;
 235         long double total;
 236
 237         assert(nsteps >= factor);
 238         assert(nsteps % factor == 0);
 239
 240         for (order = 0; order < low; order++)
 241                 value *= factor;
 242
 243         total = (long double)llquanta[bin++] * (long double)(value - 1);
 244
 245         next = value * factor;
 246         step = next > nsteps ? next / nsteps : 1;
 247
 248         while (order <= high) {
 249                 assert(value < next);
 250                 total += (long double)llquanta[bin++] * (long double)(value);
 251
 252                 if ((value += step) != next)
 253                         continue;
 254
 255                 next = value * factor;
 256                 step = next > nsteps ? next / nsteps : 1;
 257                 order++;
 258         }
 259
 260         return (total + (long double)llquanta[bin] * (long double)value);
 261 }
 262
 263 static int
 264 dt_aggregate_llquantizedcmp(int64_t *lhs, int64_t *rhs)
 265 {
 266         long double lsum = dt_aggregate_llquantizedsum(lhs);
 267         long double rsum = dt_aggregate_llquantizedsum(rhs);
 268         int64_t lzero, rzero;
 269
 270         if (lsum < rsum)
 271                 return (DT_LESSTHAN);
 272
 273         if (lsum > rsum)
 274                 return (DT_GREATERTHAN);
 275
 276         /*
 277          * If they're both equal, then we will compare based on the weights at
 278          * zero.  If the weights at zero are equal, then this will be judged a
 279          * tie and will be resolved based on the key comparison.
 280          */
 281         lzero = lhs[1];
 282         rzero = rhs[1];
 283
 284         if (lzero < rzero)
 285                 return (DT_LESSTHAN);
 286
 287         if (lzero > rzero)
 288                 return (DT_GREATERTHAN);
 289
 290         return (0);
 291 }
 292
 293 static int
 294 dt_aggregate_quantizedcmp(int64_t *lhs, int64_t *rhs)
 295 {
 296         int nbuckets = DTRACE_QUANTIZE_NBUCKETS;
 297         long double ltotal = 0, rtotal = 0;
 298         int64_t lzero, rzero;
 299         uint_t i;
 300
 301         for (i = 0; i < nbuckets; i++) {
 302                 int64_t bucketval = DTRACE_QUANTIZE_BUCKETVAL(i);
 303
 304                 if (bucketval == 0) {
 305                         lzero = lhs[i];
 306                         rzero = rhs[i];
 307                 }
 308
 309                 ltotal += (long double)bucketval * (long double)lhs[i];
 310                 rtotal += (long double)bucketval * (long double)rhs[i];
 311         }
 312
 313         if (ltotal < rtotal)
 314                 return (DT_LESSTHAN);
 315
 316         if (ltotal > rtotal)
 317                 return (DT_GREATERTHAN);
 318
 319         /*
 320          * If they're both equal, then we will compare based on the weights at
 321          * zero.  If the weights at zero are equal, then this will be judged a
 322          * tie and will be resolved based on the key comparison.
 323          */
 324         if (lzero < rzero)
 325                 return (DT_LESSTHAN);
 326
 327         if (lzero > rzero)
 328                 return (DT_GREATERTHAN);
 329
 330         return (0);
 331 }
 332
 333 static void
 334 dt_aggregate_usym(dtrace_hdl_t *dtp, uint64_t *data)
 335 {
 336         uint64_t pid = data[0];
 337         uint64_t *pc = &data[1];
 338         struct ps_prochandle *P;
 339         GElf_Sym sym;
 340
 341         if (dtp->dt_vector != NULL)
 342                 return;
 343
 344         if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
 345                 return;
 346
 347         dt_proc_lock(dtp, P);
 348
 349         if (Plookup_by_addr(P, *pc, NULL, 0, &sym) == 0)
 350                 *pc = sym.st_value;
 351
 352         dt_proc_unlock(dtp, P);
 353         dt_proc_release(dtp, P);
 354 }
 355
 356 static void
 357 dt_aggregate_umod(dtrace_hdl_t *dtp, uint64_t *data)
 358 {
 359         uint64_t pid = data[0];
 360         uint64_t *pc = &data[1];
 361         struct ps_prochandle *P;
 362         const prmap_t *map;
 363
 364         if (dtp->dt_vector != NULL)
 365                 return;
 366
 367         if ((P = dt_proc_grab(dtp, pid, PGRAB_RDONLY | PGRAB_FORCE, 0)) == NULL)
 368                 return;
 369
 370         dt_proc_lock(dtp, P);
 371
 372         if ((map = Paddr_to_map(P, *pc)) != NULL)
 373                 *pc = map->pr_vaddr;
 374
 375         dt_proc_unlock(dtp, P);
 376         dt_proc_release(dtp, P);
 377 }
 378
 379 static void
 380 dt_aggregate_sym(dtrace_hdl_t *dtp, uint64_t *data)
 381 {
 382         GElf_Sym sym;
 383         uint64_t *pc = data;
 384
 385         if (dtrace_lookup_by_addr(dtp, *pc, &sym, NULL) == 0)
 386                 *pc = sym.st_value;
 387 }
 388
 389 static void
 390 dt_aggregate_mod(dtrace_hdl_t *dtp, uint64_t *data)
 391 {
 392         uint64_t *pc = data;
 393         dt_module_t *dmp;
 394
 395         if (dtp->dt_vector != NULL) {
 396                 /*
 397                  * We don't have a way of just getting the module for a
 398                  * vectored open, and it doesn't seem to be worth defining
 399                  * one.  This means that use of mod() won't get true
 400                  * aggregation in the postmortem case (some modules may
 401                  * appear more than once in aggregation output).  It seems
 402                  * unlikely that anyone will ever notice or care...
 403                  */
 404                 return;
 405         }
 406
 407         for (dmp = dt_list_next(&dtp->dt_modlist); dmp != NULL;
 408             dmp = dt_list_next(dmp)) {
 409                 if (*pc - dmp->dm_text_va < dmp->dm_text_size) {
 410                         *pc = dmp->dm_text_va;
 411                         return;
 412                 }
 413         }
 414 }
 415
 416 static dtrace_aggvarid_t
 417 dt_aggregate_aggvarid(dt_ahashent_t *ent)
 418 {
 419         dtrace_aggdesc_t *agg = ent->dtahe_data.dtada_desc;
 420         caddr_t data = ent->dtahe_data.dtada_data;
 421         dtrace_recdesc_t *rec = agg->dtagd_rec;
 422
 423         /*
 424          * First, we'll check the variable ID in the aggdesc.  If it's valid,
 425          * we'll return it.  If not, we'll use the compiler-generated ID
 426          * present as the first record.
 427          */
 428         if (agg->dtagd_varid != DTRACE_AGGVARIDNONE)
 429                 return (agg->dtagd_varid);
 430
 431         agg->dtagd_varid = *((dtrace_aggvarid_t *)(uintptr_t)(data +
 432             rec->dtrd_offset));
 433
 434         return (agg->dtagd_varid);
 435 }
 436
 437
 438 static int
 439 dt_aggregate_snap_cpu(dtrace_hdl_t *dtp, processorid_t cpu)
 440 {
 441         dtrace_epid_t id;
 442         uint64_t hashval;
 443         size_t offs, roffs, size, ndx;
 444         int i, j, rval;
 445         caddr_t addr, data;
 446         dtrace_recdesc_t *rec;
 447         dt_aggregate_t *agp = &dtp->dt_aggregate;
 448         dtrace_aggdesc_t *agg;
 449         dt_ahash_t *hash = &agp->dtat_hash;
 450         dt_ahashent_t *h;
 451         dtrace_bufdesc_t b = agp->dtat_buf, *buf = &b;
 452         dtrace_aggdata_t *aggdata;
 453         int flags = agp->dtat_flags;
 454
 455         buf->dtbd_cpu = cpu;
 456
 457 #ifdef illumos
 458         if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, buf) == -1) {
 459 #else
 460         if (dt_ioctl(dtp, DTRACEIOC_AGGSNAP, &buf) == -1) {
 461 #endif
 462                 if (errno == ENOENT) {
 463                         /*
 464                          * If that failed with ENOENT, it may be because the
 465                          * CPU was unconfigured.  This is okay; we'll just
 466                          * do nothing but return success.
 467                          */
 468                         return (0);
 469                 }
 470
 471                 return (dt_set_errno(dtp, errno));
 472         }
 473
 474         if (buf->dtbd_drops != 0) {
 475                 xo_open_instance("probes");
 476                 dt_oformat_drop(dtp, cpu);
 477                 if (dt_handle_cpudrop(dtp, cpu,
 478                     DTRACEDROP_AGGREGATION, buf->dtbd_drops) == -1) {
 479                         xo_close_instance("probes");
 480                         return (-1);
 481                 }
 482                 xo_close_instance("probes");
 483         }
 484
 485         if (buf->dtbd_size == 0)
 486                 return (0);
 487
 488         if (hash->dtah_hash == NULL) {
 489                 size_t size;
 490
 491                 hash->dtah_size = DTRACE_AHASHSIZE;
 492                 size = hash->dtah_size * sizeof (dt_ahashent_t *);
 493
 494                 if ((hash->dtah_hash = malloc(size)) == NULL)
 495                         return (dt_set_errno(dtp, EDT_NOMEM));
 496
 497                 bzero(hash->dtah_hash, size);
 498         }
 499
 500         for (offs = 0; offs < buf->dtbd_size; ) {
 501                 /*
 502                  * We're guaranteed to have an ID.
 503                  */
 504                 id = *((dtrace_epid_t *)((uintptr_t)buf->dtbd_data +
 505                     (uintptr_t)offs));
 506
 507                 if (id == DTRACE_AGGIDNONE) {
 508                         /*
 509                          * This is filler to assure proper alignment of the
 510                          * next record; we simply ignore it.
 511                          */
 512                         offs += sizeof (id);
 513                         continue;
 514                 }
 515
 516                 if ((rval = dt_aggid_lookup(dtp, id, &agg)) != 0)
 517                         return (rval);
 518
 519                 addr = buf->dtbd_data + offs;
 520                 size = agg->dtagd_size;
 521                 hashval = 0;
 522
 523                 for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
 524                         rec = &agg->dtagd_rec[j];
 525                         roffs = rec->dtrd_offset;
 526
 527                         switch (rec->dtrd_action) {
 528                         case DTRACEACT_USYM:
 529                                 dt_aggregate_usym(dtp,
 530                                     /* LINTED - alignment */
 531                                     (uint64_t *)&addr[roffs]);
 532                                 break;
 533
 534                         case DTRACEACT_UMOD:
 535                                 dt_aggregate_umod(dtp,
 536                                     /* LINTED - alignment */
 537                                     (uint64_t *)&addr[roffs]);
 538                                 break;
 539
 540                         case DTRACEACT_SYM:
 541                                 /* LINTED - alignment */
 542                                 dt_aggregate_sym(dtp, (uint64_t *)&addr[roffs]);
 543                                 break;
 544
 545                         case DTRACEACT_MOD:
 546                                 /* LINTED - alignment */
 547                                 dt_aggregate_mod(dtp, (uint64_t *)&addr[roffs]);
 548                                 break;
 549
 550                         default:
 551                                 break;
 552                         }
 553
 554                         for (i = 0; i < rec->dtrd_size; i++)
 555                                 hashval += addr[roffs + i];
 556                 }
 557
 558                 ndx = hashval % hash->dtah_size;
 559
 560                 for (h = hash->dtah_hash[ndx]; h != NULL; h = h->dtahe_next) {
 561                         if (h->dtahe_hashval != hashval)
 562                                 continue;
 563
 564                         if (h->dtahe_size != size)
 565                                 continue;
 566
 567                         aggdata = &h->dtahe_data;
 568                         data = aggdata->dtada_data;
 569
 570                         for (j = 0; j < agg->dtagd_nrecs - 1; j++) {
 571                                 rec = &agg->dtagd_rec[j];
 572                                 roffs = rec->dtrd_offset;
 573
 574                                 for (i = 0; i < rec->dtrd_size; i++)
 575                                         if (addr[roffs + i] != data[roffs + i])
 576                                                 goto hashnext;
 577                         }
 578
 579                         /*
 580                          * We found it.  Now we need to apply the aggregating
 581                          * action on the data here.
 582                          */
 583                         rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
 584                         roffs = rec->dtrd_offset;
 585                         /* LINTED - alignment */
 586                         h->dtahe_aggregate((int64_t *)&data[roffs],
 587                             /* LINTED - alignment */
 588                             (int64_t *)&addr[roffs], rec->dtrd_size);
 589
 590                         /*
 591                          * If we're keeping per CPU data, apply the aggregating
 592                          * action there as well.
 593                          */
 594                         if (aggdata->dtada_percpu != NULL) {
 595                                 data = aggdata->dtada_percpu[cpu];
 596
 597                                 /* LINTED - alignment */
 598                                 h->dtahe_aggregate((int64_t *)data,
 599                                     /* LINTED - alignment */
 600                                     (int64_t *)&addr[roffs], rec->dtrd_size);
 601                         }
 602
 603                         goto bufnext;
 604 hashnext:
 605                         continue;
 606                 }
 607
 608                 /*
 609                  * If we're here, we couldn't find an entry for this record.
 610                  */
 611                 if ((h = malloc(sizeof (dt_ahashent_t))) == NULL)
 612                         return (dt_set_errno(dtp, EDT_NOMEM));
 613                 bzero(h, sizeof (dt_ahashent_t));
 614                 aggdata = &h->dtahe_data;
 615
 616                 if ((aggdata->dtada_data = malloc(size)) == NULL) {
 617                         free(h);
 618                         return (dt_set_errno(dtp, EDT_NOMEM));
 619                 }
 620
 621                 bcopy(addr, aggdata->dtada_data, size);
 622                 aggdata->dtada_size = size;
 623                 aggdata->dtada_desc = agg;
 624                 aggdata->dtada_handle = dtp;
 625                 (void) dt_epid_lookup(dtp, agg->dtagd_epid,
 626                     &aggdata->dtada_edesc, &aggdata->dtada_pdesc);
 627                 aggdata->dtada_normal = 1;
 628
 629                 h->dtahe_hashval = hashval;
 630                 h->dtahe_size = size;
 631                 (void) dt_aggregate_aggvarid(h);
 632
 633                 rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
 634
 635                 if (flags & DTRACE_A_PERCPU) {
 636                         int max_cpus = agp->dtat_maxcpu;
 637                         caddr_t *percpu = malloc(max_cpus * sizeof (caddr_t));
 638
 639                         if (percpu == NULL) {
 640                                 free(aggdata->dtada_data);
 641                                 free(h);
 642                                 return (dt_set_errno(dtp, EDT_NOMEM));
 643                         }
 644
 645                         for (j = 0; j < max_cpus; j++) {
 646                                 percpu[j] = malloc(rec->dtrd_size);
 647
 648                                 if (percpu[j] == NULL) {
 649                                         while (--j >= 0)
 650                                                 free(percpu[j]);
 651
 652                                         free(aggdata->dtada_data);
 653                                         free(h);
 654                                         return (dt_set_errno(dtp, EDT_NOMEM));
 655                                 }
 656
 657                                 if (j == cpu) {
 658                                         bcopy(&addr[rec->dtrd_offset],
 659                                             percpu[j], rec->dtrd_size);
 660                                 } else {
 661                                         bzero(percpu[j], rec->dtrd_size);
 662                                 }
 663                         }
 664
 665                         aggdata->dtada_percpu = percpu;
 666                 }
 667
 668                 switch (rec->dtrd_action) {
 669                 case DTRACEAGG_MIN:
 670                         h->dtahe_aggregate = dt_aggregate_min;
 671                         break;
 672
 673                 case DTRACEAGG_MAX:
 674                         h->dtahe_aggregate = dt_aggregate_max;
 675                         break;
 676
 677                 case DTRACEAGG_LQUANTIZE:
 678                         h->dtahe_aggregate = dt_aggregate_lquantize;
 679                         break;
 680
 681                 case DTRACEAGG_LLQUANTIZE:
 682                         h->dtahe_aggregate = dt_aggregate_llquantize;
 683                         break;
 684
 685                 case DTRACEAGG_COUNT:
 686                 case DTRACEAGG_SUM:
 687                 case DTRACEAGG_AVG:
 688                 case DTRACEAGG_STDDEV:
 689                 case DTRACEAGG_QUANTIZE:
 690                         h->dtahe_aggregate = dt_aggregate_count;
 691                         break;
 692
 693                 default:
 694                         return (dt_set_errno(dtp, EDT_BADAGG));
 695                 }
 696
 697                 if (hash->dtah_hash[ndx] != NULL)
 698                         hash->dtah_hash[ndx]->dtahe_prev = h;
 699
 700                 h->dtahe_next = hash->dtah_hash[ndx];
 701                 hash->dtah_hash[ndx] = h;
 702
 703                 if (hash->dtah_all != NULL)
 704                         hash->dtah_all->dtahe_prevall = h;
 705
 706                 h->dtahe_nextall = hash->dtah_all;
 707                 hash->dtah_all = h;
 708 bufnext:
 709                 offs += agg->dtagd_size;
 710         }
 711
 712         return (0);
 713 }
 714
 715 int
 716 dtrace_aggregate_snap(dtrace_hdl_t *dtp)
 717 {
 718         int i, rval;
 719         dt_aggregate_t *agp = &dtp->dt_aggregate;
 720         hrtime_t now = gethrtime();
 721         dtrace_optval_t interval = dtp->dt_options[DTRACEOPT_AGGRATE];
 722
 723         if (dtp->dt_lastagg != 0) {
 724                 if (now - dtp->dt_lastagg < interval)
 725                         return (0);
 726
 727                 dtp->dt_lastagg += interval;
 728         } else {
 729                 dtp->dt_lastagg = now;
 730         }
 731
 732         if (!dtp->dt_active)
 733                 return (dt_set_errno(dtp, EINVAL));
 734
 735         if (agp->dtat_buf.dtbd_size == 0)
 736                 return (0);
 737
 738         for (i = 0; i < agp->dtat_ncpus; i++) {
 739                 if ((rval = dt_aggregate_snap_cpu(dtp, agp->dtat_cpus[i])))
 740                         return (rval);
 741         }
 742
 743         return (0);
 744 }
 745
 746 static int
 747 dt_aggregate_hashcmp(const void *lhs, const void *rhs)
 748 {
 749         dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
 750         dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
 751         dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
 752         dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
 753
 754         if (lagg->dtagd_nrecs < ragg->dtagd_nrecs)
 755                 return (DT_LESSTHAN);
 756
 757         if (lagg->dtagd_nrecs > ragg->dtagd_nrecs)
 758                 return (DT_GREATERTHAN);
 759
 760         return (0);
 761 }
 762
 763 static int
 764 dt_aggregate_varcmp(const void *lhs, const void *rhs)
 765 {
 766         dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
 767         dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
 768         dtrace_aggvarid_t lid, rid;
 769
 770         lid = dt_aggregate_aggvarid(lh);
 771         rid = dt_aggregate_aggvarid(rh);
 772
 773         if (lid < rid)
 774                 return (DT_LESSTHAN);
 775
 776         if (lid > rid)
 777                 return (DT_GREATERTHAN);
 778
 779         return (0);
 780 }
 781
 782 static int
 783 dt_aggregate_keycmp(const void *lhs, const void *rhs)
 784 {
 785         dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
 786         dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
 787         dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
 788         dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
 789         dtrace_recdesc_t *lrec, *rrec;
 790         char *ldata, *rdata;
 791         int rval, i, j, keypos, nrecs;
 792
 793         if ((rval = dt_aggregate_hashcmp(lhs, rhs)) != 0)
 794                 return (rval);
 795
 796         nrecs = lagg->dtagd_nrecs - 1;
 797         assert(nrecs == ragg->dtagd_nrecs - 1);
 798
 799         keypos = dt_keypos + 1 >= nrecs ? 0 : dt_keypos;
 800
 801         for (i = 1; i < nrecs; i++) {
 802                 uint64_t lval, rval;
 803                 int ndx = i + keypos;
 804
 805                 if (ndx >= nrecs)
 806                         ndx = ndx - nrecs + 1;
 807
 808                 lrec = &lagg->dtagd_rec[ndx];
 809                 rrec = &ragg->dtagd_rec[ndx];
 810
 811                 ldata = lh->dtahe_data.dtada_data + lrec->dtrd_offset;
 812                 rdata = rh->dtahe_data.dtada_data + rrec->dtrd_offset;
 813
 814                 if (lrec->dtrd_size < rrec->dtrd_size)
 815                         return (DT_LESSTHAN);
 816
 817                 if (lrec->dtrd_size > rrec->dtrd_size)
 818                         return (DT_GREATERTHAN);
 819
 820                 switch (lrec->dtrd_size) {
 821                 case sizeof (uint64_t):
 822                         /* LINTED - alignment */
 823                         lval = *((uint64_t *)ldata);
 824                         /* LINTED - alignment */
 825                         rval = *((uint64_t *)rdata);
 826                         break;
 827
 828                 case sizeof (uint32_t):
 829                         /* LINTED - alignment */
 830                         lval = *((uint32_t *)ldata);
 831                         /* LINTED - alignment */
 832                         rval = *((uint32_t *)rdata);
 833                         break;
 834
 835                 case sizeof (uint16_t):
 836                         /* LINTED - alignment */
 837                         lval = *((uint16_t *)ldata);
 838                         /* LINTED - alignment */
 839                         rval = *((uint16_t *)rdata);
 840                         break;
 841
 842                 case sizeof (uint8_t):
 843                         lval = *((uint8_t *)ldata);
 844                         rval = *((uint8_t *)rdata);
 845                         break;
 846
 847                 default:
 848                         switch (lrec->dtrd_action) {
 849                         case DTRACEACT_UMOD:
 850                         case DTRACEACT_UADDR:
 851                         case DTRACEACT_USYM:
 852                                 for (j = 0; j < 2; j++) {
 853                                         /* LINTED - alignment */
 854                                         lval = ((uint64_t *)ldata)[j];
 855                                         /* LINTED - alignment */
 856                                         rval = ((uint64_t *)rdata)[j];
 857
 858                                         if (lval < rval)
 859                                                 return (DT_LESSTHAN);
 860
 861                                         if (lval > rval)
 862                                                 return (DT_GREATERTHAN);
 863                                 }
 864
 865                                 break;
 866
 867                         default:
 868                                 for (j = 0; j < lrec->dtrd_size; j++) {
 869                                         lval = ((uint8_t *)ldata)[j];
 870                                         rval = ((uint8_t *)rdata)[j];
 871
 872                                         if (lval < rval)
 873                                                 return (DT_LESSTHAN);
 874
 875                                         if (lval > rval)
 876                                                 return (DT_GREATERTHAN);
 877                                 }
 878                         }
 879
 880                         continue;
 881                 }
 882
 883                 if (lval < rval)
 884                         return (DT_LESSTHAN);
 885
 886                 if (lval > rval)
 887                         return (DT_GREATERTHAN);
 888         }
 889
 890         return (0);
 891 }
 892
 893 static int
 894 dt_aggregate_valcmp(const void *lhs, const void *rhs)
 895 {
 896         dt_ahashent_t *lh = *((dt_ahashent_t **)lhs);
 897         dt_ahashent_t *rh = *((dt_ahashent_t **)rhs);
 898         dtrace_aggdesc_t *lagg = lh->dtahe_data.dtada_desc;
 899         dtrace_aggdesc_t *ragg = rh->dtahe_data.dtada_desc;
 900         caddr_t ldata = lh->dtahe_data.dtada_data;
 901         caddr_t rdata = rh->dtahe_data.dtada_data;
 902         dtrace_recdesc_t *lrec, *rrec;
 903         int64_t *laddr, *raddr;
 904         int rval;
 905
 906         assert(lagg->dtagd_nrecs == ragg->dtagd_nrecs);
 907
 908         lrec = &lagg->dtagd_rec[lagg->dtagd_nrecs - 1];
 909         rrec = &ragg->dtagd_rec[ragg->dtagd_nrecs - 1];
 910
 911         assert(lrec->dtrd_action == rrec->dtrd_action);
 912
 913         laddr = (int64_t *)(uintptr_t)(ldata + lrec->dtrd_offset);
 914         raddr = (int64_t *)(uintptr_t)(rdata + rrec->dtrd_offset);
 915
 916         switch (lrec->dtrd_action) {
 917         case DTRACEAGG_AVG:
 918                 rval = dt_aggregate_averagecmp(laddr, raddr);
 919                 break;
 920
 921         case DTRACEAGG_STDDEV:
 922                 rval = dt_aggregate_stddevcmp(laddr, raddr);
 923                 break;
 924
 925         case DTRACEAGG_QUANTIZE:
 926                 rval = dt_aggregate_quantizedcmp(laddr, raddr);
 927                 break;
 928
 929         case DTRACEAGG_LQUANTIZE:
 930                 rval = dt_aggregate_lquantizedcmp(laddr, raddr);
 931                 break;
 932
 933         case DTRACEAGG_LLQUANTIZE:
 934                 rval = dt_aggregate_llquantizedcmp(laddr, raddr);
 935                 break;
 936
 937         case DTRACEAGG_COUNT:
 938         case DTRACEAGG_SUM:
 939         case DTRACEAGG_MIN:
 940         case DTRACEAGG_MAX:
 941                 rval = dt_aggregate_countcmp(laddr, raddr);
 942                 break;
 943
 944         default:
 945                 assert(0);
 946         }
 947
 948         return (rval);
 949 }
 950
 951 static int
 952 dt_aggregate_valkeycmp(const void *lhs, const void *rhs)
 953 {
 954         int rval;
 955
 956         if ((rval = dt_aggregate_valcmp(lhs, rhs)) != 0)
 957                 return (rval);
 958
 959         /*
 960          * If we're here, the values for the two aggregation elements are
 961          * equal.  We already know that the key layout is the same for the two
 962          * elements; we must now compare the keys themselves as a tie-breaker.
 963          */
 964         return (dt_aggregate_keycmp(lhs, rhs));
 965 }
 966
 967 static int
 968 dt_aggregate_keyvarcmp(const void *lhs, const void *rhs)
 969 {
 970         int rval;
 971
 972         if ((rval = dt_aggregate_keycmp(lhs, rhs)) != 0)
 973                 return (rval);
 974
 975         return (dt_aggregate_varcmp(lhs, rhs));
 976 }
 977
 978 static int
 979 dt_aggregate_varkeycmp(const void *lhs, const void *rhs)
 980 {
 981         int rval;
 982
 983         if ((rval = dt_aggregate_varcmp(lhs, rhs)) != 0)
 984                 return (rval);
 985
 986         return (dt_aggregate_keycmp(lhs, rhs));
 987 }
 988
 989 static int
 990 dt_aggregate_valvarcmp(const void *lhs, const void *rhs)
 991 {
 992         int rval;
 993
 994         if ((rval = dt_aggregate_valkeycmp(lhs, rhs)) != 0)
 995                 return (rval);
 996
 997         return (dt_aggregate_varcmp(lhs, rhs));
 998 }
 999
/*
 * Comparator whose primary ordering comes from dt_aggregate_varcmp(); when
 * that reports equality the result of dt_aggregate_valkeycmp() (value, then
 * key) is used instead.
 */
static int
dt_aggregate_varvalcmp(const void *lhs, const void *rhs)
{
        int byvar = dt_aggregate_varcmp(lhs, rhs);

        if (byvar == 0)
                return (dt_aggregate_valkeycmp(lhs, rhs));

        return (byvar);
}
1010
/*
 * Reverse of dt_aggregate_keyvarcmp(): the operands are exchanged before
 * being handed to the forward comparator.
 */
static int
dt_aggregate_keyvarrevcmp(const void *lhs, const void *rhs)
{
        const void *first = rhs;
        const void *second = lhs;

        return (dt_aggregate_keyvarcmp(first, second));
}
1016
/*
 * Reverse of dt_aggregate_varkeycmp(): the operands are exchanged before
 * being handed to the forward comparator.
 */
static int
dt_aggregate_varkeyrevcmp(const void *lhs, const void *rhs)
{
        const void *first = rhs;
        const void *second = lhs;

        return (dt_aggregate_varkeycmp(first, second));
}
1022
/*
 * Reverse of dt_aggregate_valvarcmp(): the operands are exchanged before
 * being handed to the forward comparator.
 */
static int
dt_aggregate_valvarrevcmp(const void *lhs, const void *rhs)
{
        const void *first = rhs;
        const void *second = lhs;

        return (dt_aggregate_valvarcmp(first, second));
}
1028
/*
 * Reverse of dt_aggregate_varvalcmp(): the operands are exchanged before
 * being handed to the forward comparator.
 */
static int
dt_aggregate_varvalrevcmp(const void *lhs, const void *rhs)
{
        const void *first = rhs;
        const void *second = lhs;

        return (dt_aggregate_varvalcmp(first, second));
}
1034
1035 static int
1036 dt_aggregate_bundlecmp(const void *lhs, const void *rhs)
1037 {
1038         dt_ahashent_t **lh = *((dt_ahashent_t ***)lhs);
1039         dt_ahashent_t **rh = *((dt_ahashent_t ***)rhs);
1040         int i, rval;
1041
1042         if (dt_keysort) {
1043                 /*
1044                  * If we're sorting on keys, we need to scan until we find the
1045                  * last entry -- that's the representative key.  (The order of
1046                  * the bundle is values followed by key to accommodate the
1047                  * default behavior of sorting by value.)  If the keys are
1048                  * equal, we'll fall into the value comparison loop, below.
1049                  */
1050                 for (i = 0; lh[i + 1] != NULL; i++)
1051                         continue;
1052
1053                 assert(i != 0);
1054                 assert(rh[i + 1] == NULL);
1055
1056                 if ((rval = dt_aggregate_keycmp(&lh[i], &rh[i])) != 0)
1057                         return (rval);
1058         }
1059
1060         for (i = 0; ; i++) {
1061                 if (lh[i + 1] == NULL) {
1062                         /*
1063                          * All of the values are equal; if we're sorting on
1064                          * keys, then we're only here because the keys were
1065                          * found to be equal and these records are therefore
1066                          * equal.  If we're not sorting on keys, we'll use the
1067                          * key comparison from the representative key as the
1068                          * tie-breaker.
1069                          */
1070                         if (dt_keysort)
1071                                 return (0);
1072
1073                         assert(i != 0);
1074                         assert(rh[i + 1] == NULL);
1075                         return (dt_aggregate_keycmp(&lh[i], &rh[i]));
1076                 } else {
1077                         if ((rval = dt_aggregate_valcmp(&lh[i], &rh[i])) != 0)
1078                                 return (rval);
1079                 }
1080         }
1081 }
1082
1083 int
1084 dt_aggregate_go(dtrace_hdl_t *dtp)
1085 {
1086         dt_aggregate_t *agp = &dtp->dt_aggregate;
1087         dtrace_optval_t size, cpu;
1088         dtrace_bufdesc_t *buf = &agp->dtat_buf;
1089         int rval, i;
1090
1091         assert(agp->dtat_maxcpu == 0);
1092         assert(agp->dtat_ncpu == 0);
1093         assert(agp->dtat_cpus == NULL);
1094
1095         agp->dtat_maxcpu = dt_sysconf(dtp, _SC_CPUID_MAX) + 1;
1096         agp->dtat_ncpu = dt_sysconf(dtp, _SC_NPROCESSORS_MAX);
1097         agp->dtat_cpus = malloc(agp->dtat_ncpu * sizeof (processorid_t));
1098
1099         if (agp->dtat_cpus == NULL)
1100                 return (dt_set_errno(dtp, EDT_NOMEM));
1101
1102         /*
1103          * Use the aggregation buffer size as reloaded from the kernel.
1104          */
1105         size = dtp->dt_options[DTRACEOPT_AGGSIZE];
1106
1107         rval = dtrace_getopt(dtp, "aggsize", &size);
1108         assert(rval == 0);
1109
1110         if (size == 0 || size == DTRACEOPT_UNSET)
1111                 return (0);
1112
1113         buf = &agp->dtat_buf;
1114         buf->dtbd_size = size;
1115
1116         if ((buf->dtbd_data = malloc(buf->dtbd_size)) == NULL)
1117                 return (dt_set_errno(dtp, EDT_NOMEM));
1118
1119         /*
1120          * Now query for the CPUs enabled.
1121          */
1122         rval = dtrace_getopt(dtp, "cpu", &cpu);
1123         assert(rval == 0 && cpu != DTRACEOPT_UNSET);
1124
1125         if (cpu != DTRACE_CPUALL) {
1126                 assert(cpu < agp->dtat_ncpu);
1127                 agp->dtat_cpus[agp->dtat_ncpus++] = (processorid_t)cpu;
1128
1129                 return (0);
1130         }
1131
1132         agp->dtat_ncpus = 0;
1133         for (i = 0; i < agp->dtat_maxcpu; i++) {
1134                 if (dt_status(dtp, i) == -1)
1135                         continue;
1136
1137                 agp->dtat_cpus[agp->dtat_ncpus++] = i;
1138         }
1139
1140         return (0);
1141 }
1142
1143 static int
1144 dt_aggwalk_rval(dtrace_hdl_t *dtp, dt_ahashent_t *h, int rval)
1145 {
1146         dt_aggregate_t *agp = &dtp->dt_aggregate;
1147         dtrace_aggdata_t *data;
1148         dtrace_aggdesc_t *aggdesc;
1149         dtrace_recdesc_t *rec;
1150         int i;
1151
1152         switch (rval) {
1153         case DTRACE_AGGWALK_NEXT:
1154                 break;
1155
1156         case DTRACE_AGGWALK_CLEAR: {
1157                 uint32_t size, offs = 0;
1158
1159                 aggdesc = h->dtahe_data.dtada_desc;
1160                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1161                 size = rec->dtrd_size;
1162                 data = &h->dtahe_data;
1163
1164                 if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
1165                         offs = sizeof (uint64_t);
1166                         size -= sizeof (uint64_t);
1167                 }
1168
1169                 bzero(&data->dtada_data[rec->dtrd_offset] + offs, size);
1170
1171                 if (data->dtada_percpu == NULL)
1172                         break;
1173
1174                 for (i = 0; i < dtp->dt_aggregate.dtat_maxcpu; i++)
1175                         bzero(data->dtada_percpu[i] + offs, size);
1176                 break;
1177         }
1178
1179         case DTRACE_AGGWALK_ERROR:
1180                 /*
1181                  * We assume that errno is already set in this case.
1182                  */
1183                 return (dt_set_errno(dtp, errno));
1184
1185         case DTRACE_AGGWALK_ABORT:
1186                 return (dt_set_errno(dtp, EDT_DIRABORT));
1187
1188         case DTRACE_AGGWALK_DENORMALIZE:
1189                 h->dtahe_data.dtada_normal = 1;
1190                 return (0);
1191
1192         case DTRACE_AGGWALK_NORMALIZE:
1193                 if (h->dtahe_data.dtada_normal == 0) {
1194                         h->dtahe_data.dtada_normal = 1;
1195                         return (dt_set_errno(dtp, EDT_BADRVAL));
1196                 }
1197
1198                 return (0);
1199
1200         case DTRACE_AGGWALK_REMOVE: {
1201                 dtrace_aggdata_t *aggdata = &h->dtahe_data;
1202                 int max_cpus = agp->dtat_maxcpu;
1203
1204                 /*
1205                  * First, remove this hash entry from its hash chain.
1206                  */
1207                 if (h->dtahe_prev != NULL) {
1208                         h->dtahe_prev->dtahe_next = h->dtahe_next;
1209                 } else {
1210                         dt_ahash_t *hash = &agp->dtat_hash;
1211                         size_t ndx = h->dtahe_hashval % hash->dtah_size;
1212
1213                         assert(hash->dtah_hash[ndx] == h);
1214                         hash->dtah_hash[ndx] = h->dtahe_next;
1215                 }
1216
1217                 if (h->dtahe_next != NULL)
1218                         h->dtahe_next->dtahe_prev = h->dtahe_prev;
1219
1220                 /*
1221                  * Now remove it from the list of all hash entries.
1222                  */
1223                 if (h->dtahe_prevall != NULL) {
1224                         h->dtahe_prevall->dtahe_nextall = h->dtahe_nextall;
1225                 } else {
1226                         dt_ahash_t *hash = &agp->dtat_hash;
1227
1228                         assert(hash->dtah_all == h);
1229                         hash->dtah_all = h->dtahe_nextall;
1230                 }
1231
1232                 if (h->dtahe_nextall != NULL)
1233                         h->dtahe_nextall->dtahe_prevall = h->dtahe_prevall;
1234
1235                 /*
1236                  * We're unlinked.  We can safely destroy the data.
1237                  */
1238                 if (aggdata->dtada_percpu != NULL) {
1239                         for (i = 0; i < max_cpus; i++)
1240                                 free(aggdata->dtada_percpu[i]);
1241                         free(aggdata->dtada_percpu);
1242                 }
1243
1244                 free(aggdata->dtada_data);
1245                 free(h);
1246
1247                 return (0);
1248         }
1249
1250         default:
1251                 return (dt_set_errno(dtp, EDT_BADRVAL));
1252         }
1253
1254         return (0);
1255 }
1256
1257 void
1258 dt_aggregate_qsort(dtrace_hdl_t *dtp, void *base, size_t nel, size_t width,
1259     int (*compar)(const void *, const void *))
1260 {
1261         int rev = dt_revsort, key = dt_keysort, keypos = dt_keypos;
1262         dtrace_optval_t keyposopt = dtp->dt_options[DTRACEOPT_AGGSORTKEYPOS];
1263
1264         dt_revsort = (dtp->dt_options[DTRACEOPT_AGGSORTREV] != DTRACEOPT_UNSET);
1265         dt_keysort = (dtp->dt_options[DTRACEOPT_AGGSORTKEY] != DTRACEOPT_UNSET);
1266
1267         if (keyposopt != DTRACEOPT_UNSET && keyposopt <= INT_MAX) {
1268                 dt_keypos = (int)keyposopt;
1269         } else {
1270                 dt_keypos = 0;
1271         }
1272
1273         if (compar == NULL) {
1274                 if (!dt_keysort) {
1275                         compar = dt_aggregate_varvalcmp;
1276                 } else {
1277                         compar = dt_aggregate_varkeycmp;
1278                 }
1279         }
1280
1281         qsort(base, nel, width, compar);
1282
1283         dt_revsort = rev;
1284         dt_keysort = key;
1285         dt_keypos = keypos;
1286 }
1287
1288 int
1289 dtrace_aggregate_walk(dtrace_hdl_t *dtp, dtrace_aggregate_f *func, void *arg)
1290 {
1291         dt_ahashent_t *h, *next;
1292         dt_ahash_t *hash = &dtp->dt_aggregate.dtat_hash;
1293
1294         for (h = hash->dtah_all; h != NULL; h = next) {
1295                 /*
1296                  * dt_aggwalk_rval() can potentially remove the current hash
1297                  * entry; we need to load the next hash entry before calling
1298                  * into it.
1299                  */
1300                 next = h->dtahe_nextall;
1301
1302                 if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1303                         return (-1);
1304         }
1305
1306         return (0);
1307 }
1308
1309 static int
1310 dt_aggregate_total(dtrace_hdl_t *dtp, boolean_t clear)
1311 {
1312         dt_ahashent_t *h;
1313         dtrace_aggdata_t **total;
1314         dtrace_aggid_t max = DTRACE_AGGVARIDNONE, id;
1315         dt_aggregate_t *agp = &dtp->dt_aggregate;
1316         dt_ahash_t *hash = &agp->dtat_hash;
1317         uint32_t tflags;
1318
1319         tflags = DTRACE_A_TOTAL | DTRACE_A_HASNEGATIVES | DTRACE_A_HASPOSITIVES;
1320
1321         /*
1322          * If we need to deliver per-aggregation totals, we're going to take
1323          * three passes over the aggregate:  one to clear everything out and
1324          * determine our maximum aggregation ID, one to actually total
1325          * everything up, and a final pass to assign the totals to the
1326          * individual elements.
1327          */
1328         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1329                 dtrace_aggdata_t *aggdata = &h->dtahe_data;
1330
1331                 if ((id = dt_aggregate_aggvarid(h)) > max)
1332                         max = id;
1333
1334                 aggdata->dtada_total = 0;
1335                 aggdata->dtada_flags &= ~tflags;
1336         }
1337
1338         if (clear || max == DTRACE_AGGVARIDNONE)
1339                 return (0);
1340
1341         total = dt_zalloc(dtp, (max + 1) * sizeof (dtrace_aggdata_t *));
1342
1343         if (total == NULL)
1344                 return (-1);
1345
1346         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1347                 dtrace_aggdata_t *aggdata = &h->dtahe_data;
1348                 dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1349                 dtrace_recdesc_t *rec;
1350                 caddr_t data;
1351                 int64_t val, *addr;
1352
1353                 rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
1354                 data = aggdata->dtada_data;
1355                 addr = (int64_t *)(uintptr_t)(data + rec->dtrd_offset);
1356
1357                 switch (rec->dtrd_action) {
1358                 case DTRACEAGG_STDDEV:
1359                         val = dt_stddev((uint64_t *)addr, 1);
1360                         break;
1361
1362                 case DTRACEAGG_SUM:
1363                 case DTRACEAGG_COUNT:
1364                         val = *addr;
1365                         break;
1366
1367                 case DTRACEAGG_AVG:
1368                         val = addr[0] ? (addr[1] / addr[0]) : 0;
1369                         break;
1370
1371                 default:
1372                         continue;
1373                 }
1374
1375                 if (total[agg->dtagd_varid] == NULL) {
1376                         total[agg->dtagd_varid] = aggdata;
1377                         aggdata->dtada_flags |= DTRACE_A_TOTAL;
1378                 } else {
1379                         aggdata = total[agg->dtagd_varid];
1380                 }
1381
1382                 if (val > 0)
1383                         aggdata->dtada_flags |= DTRACE_A_HASPOSITIVES;
1384
1385                 if (val < 0) {
1386                         aggdata->dtada_flags |= DTRACE_A_HASNEGATIVES;
1387                         val = -val;
1388                 }
1389
1390                 if (dtp->dt_options[DTRACEOPT_AGGZOOM] != DTRACEOPT_UNSET) {
1391                         val = (int64_t)((long double)val *
1392                             (1 / DTRACE_AGGZOOM_MAX));
1393
1394                         if (val > aggdata->dtada_total)
1395                                 aggdata->dtada_total = val;
1396                 } else {
1397                         aggdata->dtada_total += val;
1398                 }
1399         }
1400
1401         /*
1402          * And now one final pass to set everyone's total.
1403          */
1404         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1405                 dtrace_aggdata_t *aggdata = &h->dtahe_data, *t;
1406                 dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1407
1408                 if ((t = total[agg->dtagd_varid]) == NULL || aggdata == t)
1409                         continue;
1410
1411                 aggdata->dtada_total = t->dtada_total;
1412                 aggdata->dtada_flags |= (t->dtada_flags & tflags);
1413         }
1414
1415         dt_free(dtp, total);
1416
1417         return (0);
1418 }
1419
1420 static int
1421 dt_aggregate_minmaxbin(dtrace_hdl_t *dtp, boolean_t clear)
1422 {
1423         dt_ahashent_t *h;
1424         dtrace_aggdata_t **minmax;
1425         dtrace_aggid_t max = DTRACE_AGGVARIDNONE, id;
1426         dt_aggregate_t *agp = &dtp->dt_aggregate;
1427         dt_ahash_t *hash = &agp->dtat_hash;
1428
1429         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1430                 dtrace_aggdata_t *aggdata = &h->dtahe_data;
1431
1432                 if ((id = dt_aggregate_aggvarid(h)) > max)
1433                         max = id;
1434
1435                 aggdata->dtada_minbin = 0;
1436                 aggdata->dtada_maxbin = 0;
1437                 aggdata->dtada_flags &= ~DTRACE_A_MINMAXBIN;
1438         }
1439
1440         if (clear || max == DTRACE_AGGVARIDNONE)
1441                 return (0);
1442
1443         minmax = dt_zalloc(dtp, (max + 1) * sizeof (dtrace_aggdata_t *));
1444
1445         if (minmax == NULL)
1446                 return (-1);
1447
1448         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1449                 dtrace_aggdata_t *aggdata = &h->dtahe_data;
1450                 dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1451                 dtrace_recdesc_t *rec;
1452                 caddr_t data;
1453                 int64_t *addr;
1454                 int minbin = -1, maxbin = -1, i;
1455                 int start = 0, size;
1456
1457                 rec = &agg->dtagd_rec[agg->dtagd_nrecs - 1];
1458                 size = rec->dtrd_size / sizeof (int64_t);
1459                 data = aggdata->dtada_data;
1460                 addr = (int64_t *)(uintptr_t)(data + rec->dtrd_offset);
1461
1462                 switch (rec->dtrd_action) {
1463                 case DTRACEAGG_LQUANTIZE:
1464                         /*
1465                          * For lquantize(), we always display the entire range
1466                          * of the aggregation when aggpack is set.
1467                          */
1468                         start = 1;
1469                         minbin = start;
1470                         maxbin = size - 1 - start;
1471                         break;
1472
1473                 case DTRACEAGG_QUANTIZE:
1474                         for (i = start; i < size; i++) {
1475                                 if (!addr[i])
1476                                         continue;
1477
1478                                 if (minbin == -1)
1479                                         minbin = i - start;
1480
1481                                 maxbin = i - start;
1482                         }
1483
1484                         if (minbin == -1) {
1485                                 /*
1486                                  * If we have no data (e.g., due to a clear()
1487                                  * or negative increments), we'll use the
1488                                  * zero bucket as both our min and max.
1489                                  */
1490                                 minbin = maxbin = DTRACE_QUANTIZE_ZEROBUCKET;
1491                         }
1492
1493                         break;
1494
1495                 default:
1496                         continue;
1497                 }
1498
1499                 if (minmax[agg->dtagd_varid] == NULL) {
1500                         minmax[agg->dtagd_varid] = aggdata;
1501                         aggdata->dtada_flags |= DTRACE_A_MINMAXBIN;
1502                         aggdata->dtada_minbin = minbin;
1503                         aggdata->dtada_maxbin = maxbin;
1504                         continue;
1505                 }
1506
1507                 if (minbin < minmax[agg->dtagd_varid]->dtada_minbin)
1508                         minmax[agg->dtagd_varid]->dtada_minbin = minbin;
1509
1510                 if (maxbin > minmax[agg->dtagd_varid]->dtada_maxbin)
1511                         minmax[agg->dtagd_varid]->dtada_maxbin = maxbin;
1512         }
1513
1514         /*
1515          * And now one final pass to set everyone's minbin and maxbin.
1516          */
1517         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1518                 dtrace_aggdata_t *aggdata = &h->dtahe_data, *mm;
1519                 dtrace_aggdesc_t *agg = aggdata->dtada_desc;
1520
1521                 if ((mm = minmax[agg->dtagd_varid]) == NULL || aggdata == mm)
1522                         continue;
1523
1524                 aggdata->dtada_minbin = mm->dtada_minbin;
1525                 aggdata->dtada_maxbin = mm->dtada_maxbin;
1526                 aggdata->dtada_flags |= DTRACE_A_MINMAXBIN;
1527         }
1528
1529         dt_free(dtp, minmax);
1530
1531         return (0);
1532 }
1533
1534 static int
1535 dt_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1536     dtrace_aggregate_f *func, void *arg,
1537     int (*sfunc)(const void *, const void *))
1538 {
1539         dt_aggregate_t *agp = &dtp->dt_aggregate;
1540         dt_ahashent_t *h, **sorted;
1541         dt_ahash_t *hash = &agp->dtat_hash;
1542         size_t i, nentries = 0;
1543         int rval = -1;
1544
1545         agp->dtat_flags &= ~(DTRACE_A_TOTAL | DTRACE_A_MINMAXBIN);
1546
1547         if (dtp->dt_options[DTRACEOPT_AGGHIST] != DTRACEOPT_UNSET) {
1548                 agp->dtat_flags |= DTRACE_A_TOTAL;
1549
1550                 if (dt_aggregate_total(dtp, B_FALSE) != 0)
1551                         return (-1);
1552         }
1553
1554         if (dtp->dt_options[DTRACEOPT_AGGPACK] != DTRACEOPT_UNSET) {
1555                 agp->dtat_flags |= DTRACE_A_MINMAXBIN;
1556
1557                 if (dt_aggregate_minmaxbin(dtp, B_FALSE) != 0)
1558                         return (-1);
1559         }
1560
1561         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall)
1562                 nentries++;
1563
1564         sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1565
1566         if (sorted == NULL)
1567                 goto out;
1568
1569         for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall)
1570                 sorted[i++] = h;
1571
1572         (void) pthread_mutex_lock(&dt_qsort_lock);
1573
1574         if (sfunc == NULL) {
1575                 dt_aggregate_qsort(dtp, sorted, nentries,
1576                     sizeof (dt_ahashent_t *), NULL);
1577         } else {
1578                 /*
1579                  * If we've been explicitly passed a sorting function,
1580                  * we'll use that -- ignoring the values of the "aggsortrev",
1581                  * "aggsortkey" and "aggsortkeypos" options.
1582                  */
1583                 qsort(sorted, nentries, sizeof (dt_ahashent_t *), sfunc);
1584         }
1585
1586         (void) pthread_mutex_unlock(&dt_qsort_lock);
1587
1588         for (i = 0; i < nentries; i++) {
1589                 h = sorted[i];
1590
1591                 if (dt_aggwalk_rval(dtp, h, func(&h->dtahe_data, arg)) == -1)
1592                         goto out;
1593         }
1594
1595         rval = 0;
1596 out:
1597         if (agp->dtat_flags & DTRACE_A_TOTAL)
1598                 (void) dt_aggregate_total(dtp, B_TRUE);
1599
1600         if (agp->dtat_flags & DTRACE_A_MINMAXBIN)
1601                 (void) dt_aggregate_minmaxbin(dtp, B_TRUE);
1602
1603         dt_free(dtp, sorted);
1604         return (rval);
1605 }
1606
1607 int
1608 dtrace_aggregate_walk_sorted(dtrace_hdl_t *dtp,
1609     dtrace_aggregate_f *func, void *arg)
1610 {
1611         return (dt_aggregate_walk_sorted(dtp, func, arg, NULL));
1612 }
1613
1614 int
1615 dtrace_aggregate_walk_keysorted(dtrace_hdl_t *dtp,
1616     dtrace_aggregate_f *func, void *arg)
1617 {
1618         return (dt_aggregate_walk_sorted(dtp, func,
1619             arg, dt_aggregate_varkeycmp));
1620 }
1621
1622 int
1623 dtrace_aggregate_walk_valsorted(dtrace_hdl_t *dtp,
1624     dtrace_aggregate_f *func, void *arg)
1625 {
1626         return (dt_aggregate_walk_sorted(dtp, func,
1627             arg, dt_aggregate_varvalcmp));
1628 }
1629
1630 int
1631 dtrace_aggregate_walk_keyvarsorted(dtrace_hdl_t *dtp,
1632     dtrace_aggregate_f *func, void *arg)
1633 {
1634         return (dt_aggregate_walk_sorted(dtp, func,
1635             arg, dt_aggregate_keyvarcmp));
1636 }
1637
1638 int
1639 dtrace_aggregate_walk_valvarsorted(dtrace_hdl_t *dtp,
1640     dtrace_aggregate_f *func, void *arg)
1641 {
1642         return (dt_aggregate_walk_sorted(dtp, func,
1643             arg, dt_aggregate_valvarcmp));
1644 }
1645
1646 int
1647 dtrace_aggregate_walk_keyrevsorted(dtrace_hdl_t *dtp,
1648     dtrace_aggregate_f *func, void *arg)
1649 {
1650         return (dt_aggregate_walk_sorted(dtp, func,
1651             arg, dt_aggregate_varkeyrevcmp));
1652 }
1653
1654 int
1655 dtrace_aggregate_walk_valrevsorted(dtrace_hdl_t *dtp,
1656     dtrace_aggregate_f *func, void *arg)
1657 {
1658         return (dt_aggregate_walk_sorted(dtp, func,
1659             arg, dt_aggregate_varvalrevcmp));
1660 }
1661
1662 int
1663 dtrace_aggregate_walk_keyvarrevsorted(dtrace_hdl_t *dtp,
1664     dtrace_aggregate_f *func, void *arg)
1665 {
1666         return (dt_aggregate_walk_sorted(dtp, func,
1667             arg, dt_aggregate_keyvarrevcmp));
1668 }
1669
1670 int
1671 dtrace_aggregate_walk_valvarrevsorted(dtrace_hdl_t *dtp,
1672     dtrace_aggregate_f *func, void *arg)
1673 {
1674         return (dt_aggregate_walk_sorted(dtp, func,
1675             arg, dt_aggregate_valvarrevcmp));
1676 }
1677
1678 int
1679 dtrace_aggregate_walk_joined(dtrace_hdl_t *dtp, dtrace_aggvarid_t *aggvars,
1680     int naggvars, dtrace_aggregate_walk_joined_f *func, void *arg)
1681 {
1682         dt_aggregate_t *agp = &dtp->dt_aggregate;
1683         dt_ahashent_t *h, **sorted = NULL, ***bundle, **nbundle;
1684         const dtrace_aggdata_t **data;
1685         dt_ahashent_t *zaggdata = NULL;
1686         dt_ahash_t *hash = &agp->dtat_hash;
1687         size_t nentries = 0, nbundles = 0, start, zsize = 0, bundlesize;
1688         dtrace_aggvarid_t max = 0, aggvar;
1689         int rval = -1, *map, *remap = NULL;
1690         int i, j;
1691         dtrace_optval_t sortpos = dtp->dt_options[DTRACEOPT_AGGSORTPOS];
1692
1693         /*
1694          * If the sorting position is greater than the number of aggregation
1695          * variable IDs, we silently set it to 0.
1696          */
1697         if (sortpos == DTRACEOPT_UNSET || sortpos >= naggvars)
1698                 sortpos = 0;
1699
1700         /*
1701          * First we need to translate the specified aggregation variable IDs
1702          * into a linear map that will allow us to translate an aggregation
1703          * variable ID into its position in the specified aggvars.
1704          */
1705         for (i = 0; i < naggvars; i++) {
1706                 if (aggvars[i] == DTRACE_AGGVARIDNONE || aggvars[i] < 0)
1707                         return (dt_set_errno(dtp, EDT_BADAGGVAR));
1708
1709                 if (aggvars[i] > max)
1710                         max = aggvars[i];
1711         }
1712
1713         if ((map = dt_zalloc(dtp, (max + 1) * sizeof (int))) == NULL)
1714                 return (-1);
1715
1716         zaggdata = dt_zalloc(dtp, naggvars * sizeof (dt_ahashent_t));
1717
1718         if (zaggdata == NULL)
1719                 goto out;
1720
1721         for (i = 0; i < naggvars; i++) {
1722                 int ndx = i + sortpos;
1723
1724                 if (ndx >= naggvars)
1725                         ndx -= naggvars;
1726
1727                 aggvar = aggvars[ndx];
1728                 assert(aggvar <= max);
1729
1730                 if (map[aggvar]) {
1731                         /*
1732                          * We have an aggregation variable that is present
1733                          * more than once in the array of aggregation
1734                          * variables.  While it's unclear why one might want
1735                          * to do this, it's legal.  To support this construct,
1736                          * we will allocate a remap that will indicate the
1737                          * position from which this aggregation variable
1738                          * should be pulled.  (That is, where the remap will
1739                          * map from one position to another.)
1740                          */
1741                         if (remap == NULL) {
1742                                 remap = dt_zalloc(dtp, naggvars * sizeof (int));
1743
1744                                 if (remap == NULL)
1745                                         goto out;
1746                         }
1747
1748                         /*
1749                          * Given that the variable is already present, assert
1750                          * that following through the mapping and adjusting
1751                          * for the sort position yields the same aggregation
1752                          * variable ID.
1753                          */
1754                         assert(aggvars[(map[aggvar] - 1 + sortpos) %
1755                             naggvars] == aggvars[ndx]);
1756
1757                         remap[i] = map[aggvar];
1758                         continue;
1759                 }
1760
1761                 map[aggvar] = i + 1;
1762         }
1763
1764         /*
1765          * We need to take two passes over the data to size our allocation, so
1766          * we'll use the first pass to also fill in the zero-filled data to be
1767          * used to properly format a zero-valued aggregation.
1768          */
1769         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
1770                 dtrace_aggvarid_t id;
1771                 int ndx;
1772
1773                 if ((id = dt_aggregate_aggvarid(h)) > max || !(ndx = map[id]))
1774                         continue;
1775
1776                 if (zaggdata[ndx - 1].dtahe_size == 0) {
1777                         zaggdata[ndx - 1].dtahe_size = h->dtahe_size;
1778                         zaggdata[ndx - 1].dtahe_data = h->dtahe_data;
1779                 }
1780
1781                 nentries++;
1782         }
1783
1784         if (nentries == 0) {
1785                 /*
1786                  * We couldn't find any entries; there is nothing else to do.
1787                  */
1788                 rval = 0;
1789                 goto out;
1790         }
1791
1792         /*
1793          * Before we sort the data, we're going to look for any holes in our
1794          * zero-filled data.  This will occur if an aggregation variable that
1795          * we are being asked to print has not yet been assigned the result of
1796          * any aggregating action for _any_ tuple.  The issue becomes that we
1797          * would like a zero value to be printed for all columns for this
1798          * aggregation, but without any record description, we don't know the
1799          * aggregating action that corresponds to the aggregation variable.  To
1800          * try to find a match, we're simply going to lookup aggregation IDs
1801          * (which are guaranteed to be contiguous and to start from 1), looking
1802          * for the specified aggregation variable ID.  If we find a match,
1803          * we'll use that.  If we iterate over all aggregation IDs and don't
1804          * find a match, then we must be an anonymous enabling.  (Anonymous
1805          * enablings can't currently derive either aggregation variable IDs or
1806          * aggregation variable names given only an aggregation ID.)  In this
1807          * obscure case (anonymous enabling, multiple aggregation printa() with
1808          * some aggregations not represented for any tuple), our defined
1809          * behavior is that the zero will be printed in the format of the first
1810          * aggregation variable that contains any non-zero value.
1811          */
1812         for (i = 0; i < naggvars; i++) {
1813                 if (zaggdata[i].dtahe_size == 0) {
1814                         dtrace_aggvarid_t aggvar;
1815
1816                         aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
1817                         assert(zaggdata[i].dtahe_data.dtada_data == NULL);
1818
1819                         for (j = DTRACE_AGGIDNONE + 1; ; j++) {
1820                                 dtrace_aggdesc_t *agg;
1821                                 dtrace_aggdata_t *aggdata;
1822
1823                                 if (dt_aggid_lookup(dtp, j, &agg) != 0)
1824                                         break;
1825
1826                                 if (agg->dtagd_varid != aggvar)
1827                                         continue;
1828
1829                                 /*
1830                                  * We have our description -- now we need to
1831                                  * cons up the zaggdata entry for it.
1832                                  */
1833                                 aggdata = &zaggdata[i].dtahe_data;
1834                                 aggdata->dtada_size = agg->dtagd_size;
1835                                 aggdata->dtada_desc = agg;
1836                                 aggdata->dtada_handle = dtp;
1837                                 (void) dt_epid_lookup(dtp, agg->dtagd_epid,
1838                                     &aggdata->dtada_edesc,
1839                                     &aggdata->dtada_pdesc);
1840                                 aggdata->dtada_normal = 1;
1841                                 zaggdata[i].dtahe_hashval = 0;
1842                                 zaggdata[i].dtahe_size = agg->dtagd_size;
1843                                 break;
1844                         }
1845
1846                         if (zaggdata[i].dtahe_size == 0) {
1847                                 caddr_t data;
1848
1849                                 /*
1850                                  * We couldn't find this aggregation, meaning
1851                                  * that we have never seen it before for any
1852                                  * tuple _and_ this is an anonymous enabling.
1853                                  * That is, we're in the obscure case outlined
1854                                  * above.  In this case, our defined behavior
1855                                  * is to format the data in the format of the
1856                                  * first non-zero aggregation -- of which, of
1857                                  * course, we know there to be at least one
1858                                  * (or nentries would have been zero).
1859                                  */
1860                                 for (j = 0; j < naggvars; j++) {
1861                                         if (zaggdata[j].dtahe_size != 0)
1862                                                 break;
1863                                 }
1864
1865                                 assert(j < naggvars);
1866                                 zaggdata[i] = zaggdata[j];
1867
1868                                 data = zaggdata[i].dtahe_data.dtada_data;
1869                                 assert(data != NULL);
1870                         }
1871                 }
1872         }
1873
1874         /*
1875          * Now we need to allocate our zero-filled data for use for
1876          * aggregations that don't have a value corresponding to a given key.
1877          */
1878         for (i = 0; i < naggvars; i++) {
1879                 dtrace_aggdata_t *aggdata = &zaggdata[i].dtahe_data;
1880                 dtrace_aggdesc_t *aggdesc = aggdata->dtada_desc;
1881                 dtrace_recdesc_t *rec;
1882                 uint64_t larg;
1883                 caddr_t zdata;
1884
1885                 zsize = zaggdata[i].dtahe_size;
1886                 assert(zsize != 0);
1887
1888                 if ((zdata = dt_zalloc(dtp, zsize)) == NULL) {
1889                         /*
1890                          * If we failed to allocated some zero-filled data, we
1891                          * need to zero out the remaining dtada_data pointers
1892                          * to prevent the wrong data from being freed below.
1893                          */
1894                         for (j = i; j < naggvars; j++)
1895                                 zaggdata[j].dtahe_data.dtada_data = NULL;
1896                         goto out;
1897                 }
1898
1899                 aggvar = aggvars[(i - sortpos + naggvars) % naggvars];
1900
1901                 /*
1902                  * First, the easy bit.  To maintain compatibility with
1903                  * consumers that pull the compiler-generated ID out of the
1904                  * data, we put that ID at the top of the zero-filled data.
1905                  */
1906                 rec = &aggdesc->dtagd_rec[0];
1907                 /* LINTED - alignment */
1908                 *((dtrace_aggvarid_t *)(zdata + rec->dtrd_offset)) = aggvar;
1909
1910                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
1911
1912                 /*
1913                  * Now for the more complicated part.  If (and only if) this
1914                  * is an lquantize() aggregating action, zero-filled data is
1915                  * not equivalent to an empty record:  we must also get the
1916                  * parameters for the lquantize().
1917                  */
1918                 if (rec->dtrd_action == DTRACEAGG_LQUANTIZE) {
1919                         if (aggdata->dtada_data != NULL) {
1920                                 /*
1921                                  * The easier case here is if we actually have
1922                                  * some prototype data -- in which case we
1923                                  * manually dig it out of the aggregation
1924                                  * record.
1925                                  */
1926                                 /* LINTED - alignment */
1927                                 larg = *((uint64_t *)(aggdata->dtada_data +
1928                                     rec->dtrd_offset));
1929                         } else {
1930                                 /*
1931                                  * We don't have any prototype data.  As a
1932                                  * result, we know that we _do_ have the
1933                                  * compiler-generated information.  (If this
1934                                  * were an anonymous enabling, all of our
1935                                  * zero-filled data would have prototype data
1936                                  * -- either directly or indirectly.) So as
1937                                  * gross as it is, we'll grovel around in the
1938                                  * compiler-generated information to find the
1939                                  * lquantize() parameters.
1940                                  */
1941                                 dtrace_stmtdesc_t *sdp;
1942                                 dt_ident_t *aid;
1943                                 dt_idsig_t *isp;
1944
1945                                 sdp = (dtrace_stmtdesc_t *)(uintptr_t)
1946                                     aggdesc->dtagd_rec[0].dtrd_uarg;
1947                                 aid = sdp->dtsd_aggdata;
1948                                 isp = (dt_idsig_t *)aid->di_data;
1949                                 assert(isp->dis_auxinfo != 0);
1950                                 larg = isp->dis_auxinfo;
1951                         }
1952
1953                         /* LINTED - alignment */
1954                         *((uint64_t *)(zdata + rec->dtrd_offset)) = larg;
1955                 }
1956
1957                 aggdata->dtada_data = zdata;
1958         }
1959
1960         /*
1961          * Now that we've dealt with setting up our zero-filled data, we can
1962          * allocate our sorted array, and take another pass over the data to
1963          * fill it.
1964          */
1965         sorted = dt_alloc(dtp, nentries * sizeof (dt_ahashent_t *));
1966
1967         if (sorted == NULL)
1968                 goto out;
1969
1970         for (h = hash->dtah_all, i = 0; h != NULL; h = h->dtahe_nextall) {
1971                 dtrace_aggvarid_t id;
1972
1973                 if ((id = dt_aggregate_aggvarid(h)) > max || !map[id])
1974                         continue;
1975
1976                 sorted[i++] = h;
1977         }
1978
1979         assert(i == nentries);
1980
1981         /*
1982          * We've loaded our array; now we need to sort by value to allow us
1983          * to create bundles of like value.  We're going to acquire the
1984          * dt_qsort_lock here, and hold it across all of our subsequent
1985          * comparison and sorting.
1986          */
1987         (void) pthread_mutex_lock(&dt_qsort_lock);
1988
1989         qsort(sorted, nentries, sizeof (dt_ahashent_t *),
1990             dt_aggregate_keyvarcmp);
1991
1992         /*
1993          * Now we need to go through and create bundles.  Because the number
1994          * of bundles is bounded by the size of the sorted array, we're going
1995          * to reuse the underlying storage.  And note that "bundle" is an
1996          * array of pointers to arrays of pointers to dt_ahashent_t -- making
1997          * its type (regrettably) "dt_ahashent_t ***".  (Regrettable because
1998          * '*' -- like '_' and 'X' -- should never appear in triplicate in
1999          * an ideal world.)
2000          */
2001         bundle = (dt_ahashent_t ***)sorted;
2002
2003         for (i = 1, start = 0; i <= nentries; i++) {
2004                 if (i < nentries &&
2005                     dt_aggregate_keycmp(&sorted[i], &sorted[i - 1]) == 0)
2006                         continue;
2007
2008                 /*
2009                  * We have a bundle boundary.  Everything from start to
2010                  * (i - 1) belongs in one bundle.
2011                  */
2012                 assert(i - start <= naggvars);
2013                 bundlesize = (naggvars + 2) * sizeof (dt_ahashent_t *);
2014
2015                 if ((nbundle = dt_zalloc(dtp, bundlesize)) == NULL) {
2016                         (void) pthread_mutex_unlock(&dt_qsort_lock);
2017                         goto out;
2018                 }
2019
2020                 for (j = start; j < i; j++) {
2021                         dtrace_aggvarid_t id = dt_aggregate_aggvarid(sorted[j]);
2022
2023                         assert(id <= max);
2024                         assert(map[id] != 0);
2025                         assert(map[id] - 1 < naggvars);
2026                         assert(nbundle[map[id] - 1] == NULL);
2027                         nbundle[map[id] - 1] = sorted[j];
2028
2029                         if (nbundle[naggvars] == NULL)
2030                                 nbundle[naggvars] = sorted[j];
2031                 }
2032
2033                 for (j = 0; j < naggvars; j++) {
2034                         if (nbundle[j] != NULL)
2035                                 continue;
2036
2037                         /*
2038                          * Before we assume that this aggregation variable
2039                          * isn't present (and fall back to using the
2040                          * zero-filled data allocated earlier), check the
2041                          * remap.  If we have a remapping, we'll drop it in
2042                          * here.  Note that we might be remapping an
2043                          * aggregation variable that isn't present for this
2044                          * key; in this case, the aggregation data that we
2045                          * copy will point to the zeroed data.
2046                          */
2047                         if (remap != NULL && remap[j]) {
2048                                 assert(remap[j] - 1 < j);
2049                                 assert(nbundle[remap[j] - 1] != NULL);
2050                                 nbundle[j] = nbundle[remap[j] - 1];
2051                         } else {
2052                                 nbundle[j] = &zaggdata[j];
2053                         }
2054                 }
2055
2056                 bundle[nbundles++] = nbundle;
2057                 start = i;
2058         }
2059
2060         /*
2061          * Now we need to re-sort based on the first value.
2062          */
2063         dt_aggregate_qsort(dtp, bundle, nbundles, sizeof (dt_ahashent_t **),
2064             dt_aggregate_bundlecmp);
2065
2066         (void) pthread_mutex_unlock(&dt_qsort_lock);
2067
2068         /*
2069          * We're done!  Now we just need to go back over the sorted bundles,
2070          * calling the function.
2071          */
2072         data = alloca((naggvars + 1) * sizeof (dtrace_aggdata_t *));
2073
2074         for (i = 0; i < nbundles; i++) {
2075                 for (j = 0; j < naggvars; j++)
2076                         data[j + 1] = NULL;
2077
2078                 for (j = 0; j < naggvars; j++) {
2079                         int ndx = j - sortpos;
2080
2081                         if (ndx < 0)
2082                                 ndx += naggvars;
2083
2084                         assert(bundle[i][ndx] != NULL);
2085                         data[j + 1] = &bundle[i][ndx]->dtahe_data;
2086                 }
2087
2088                 for (j = 0; j < naggvars; j++)
2089                         assert(data[j + 1] != NULL);
2090
2091                 /*
2092                  * The representative key is the last element in the bundle.
2093                  * Assert that we have one, and then set it to be the first
2094                  * element of data.
2095                  */
2096                 assert(bundle[i][j] != NULL);
2097                 data[0] = &bundle[i][j]->dtahe_data;
2098
2099                 if ((rval = func(data, naggvars + 1, arg)) == -1)
2100                         goto out;
2101         }
2102
2103         rval = 0;
2104 out:
2105         for (i = 0; i < nbundles; i++)
2106                 dt_free(dtp, bundle[i]);
2107
2108         if (zaggdata != NULL) {
2109                 for (i = 0; i < naggvars; i++)
2110                         dt_free(dtp, zaggdata[i].dtahe_data.dtada_data);
2111         }
2112
2113         dt_free(dtp, zaggdata);
2114         dt_free(dtp, sorted);
2115         dt_free(dtp, remap);
2116         dt_free(dtp, map);
2117
2118         return (rval);
2119 }
2120
2121 int
2122 dtrace_aggregate_print(dtrace_hdl_t *dtp, FILE *fp,
2123     dtrace_aggregate_walk_f *func)
2124 {
2125         dt_print_aggdata_t pd;
2126
2127         bzero(&pd, sizeof (pd));
2128
2129         pd.dtpa_dtp = dtp;
2130         pd.dtpa_fp = fp;
2131         pd.dtpa_allunprint = 1;
2132
2133         if (func == NULL)
2134                 func = dtrace_aggregate_walk_sorted;
2135
2136         if (dtp->dt_oformat) {
2137                 if ((*func)(dtp, dt_format_agg, &pd) == -1)
2138                         return (dt_set_errno(dtp, dtp->dt_errno));
2139         } else {
2140                 if ((*func)(dtp, dt_print_agg, &pd) == -1)
2141                         return (dt_set_errno(dtp, dtp->dt_errno));
2142         }
2143
2144         return (0);
2145 }
2146
2147 void
2148 dtrace_aggregate_clear(dtrace_hdl_t *dtp)
2149 {
2150         dt_aggregate_t *agp = &dtp->dt_aggregate;
2151         dt_ahash_t *hash = &agp->dtat_hash;
2152         dt_ahashent_t *h;
2153         dtrace_aggdata_t *data;
2154         dtrace_aggdesc_t *aggdesc;
2155         dtrace_recdesc_t *rec;
2156         int i, max_cpus = agp->dtat_maxcpu;
2157
2158         for (h = hash->dtah_all; h != NULL; h = h->dtahe_nextall) {
2159                 aggdesc = h->dtahe_data.dtada_desc;
2160                 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];
2161                 data = &h->dtahe_data;
2162
2163                 bzero(&data->dtada_data[rec->dtrd_offset], rec->dtrd_size);
2164
2165                 if (data->dtada_percpu == NULL)
2166                         continue;
2167
2168                 for (i = 0; i < max_cpus; i++)
2169                         bzero(data->dtada_percpu[i], rec->dtrd_size);
2170         }
2171 }
2172
2173 void
2174 dt_aggregate_destroy(dtrace_hdl_t *dtp)
2175 {
2176         dt_aggregate_t *agp = &dtp->dt_aggregate;
2177         dt_ahash_t *hash = &agp->dtat_hash;
2178         dt_ahashent_t *h, *next;
2179         dtrace_aggdata_t *aggdata;
2180         int i, max_cpus = agp->dtat_maxcpu;
2181
2182         if (hash->dtah_hash == NULL) {
2183                 assert(hash->dtah_all == NULL);
2184         } else {
2185                 free(hash->dtah_hash);
2186
2187                 for (h = hash->dtah_all; h != NULL; h = next) {
2188                         next = h->dtahe_nextall;
2189
2190                         aggdata = &h->dtahe_data;
2191
2192                         if (aggdata->dtada_percpu != NULL) {
2193                                 for (i = 0; i < max_cpus; i++)
2194                                         free(aggdata->dtada_percpu[i]);
2195                                 free(aggdata->dtada_percpu);
2196                         }
2197
2198                         free(aggdata->dtada_data);
2199                         free(h);
2200                 }
2201
2202                 hash->dtah_hash = NULL;
2203                 hash->dtah_all = NULL;
2204                 hash->dtah_size = 0;
2205         }
2206
2207         free(agp->dtat_buf.dtbd_data);
2208         free(agp->dtat_cpus);
2209 }