contrib/awk/array.c

   1 /*
   2  * array.c - routines for associative arrays.
   3  */
   4
   5 /*
   6  * Copyright (C) 1986, 1988, 1989, 1991 - 97 the Free Software Foundation, Inc.
   7  *
   8  * This file is part of GAWK, the GNU implementation of the
   9  * AWK Programming Language.
  10  *
  11  * GAWK is free software; you can redistribute it and/or modify
  12  * it under the terms of the GNU General Public License as published by
  13  * the Free Software Foundation; either version 2 of the License, or
  14  * (at your option) any later version.
  15  *
  16  * GAWK is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19  * GNU General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU General Public License
  22  * along with this program; if not, write to the Free Software
  23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
  24  */
  25
  26 /*
  27  * Tree walks (``for (iggy in foo)'') and array deletions use expensive
  28  * linear searching.  So what we do is start out with small arrays and
  29  * grow them as needed, so that our arrays are hopefully small enough,
  30  * most of the time, that they're pretty full and we're not looking at
  31  * wasted space.
  32  *
  33  * The decision is made to grow the array if the average chain length is
  34  * ``too big''. This is defined as the total number of entries in the table
  35  * divided by the size of the array being greater than some constant.
  36  */
  37
  38 #define AVG_CHAIN_MAX   10   /* don't want to linear search more than this */
  39
  40 #include "awk.h"
  41
  42 static NODE *assoc_find P((NODE *symbol, NODE *subs, int hash1));
  43 static void grow_table P((NODE *symbol));
  44
  45 /* concat_exp --- concatenate expression list into a single string */
  46
  47 NODE *
  48 concat_exp(tree)
  49 register NODE *tree;
  50 {
  51         register NODE *r;
  52         char *str;
  53         char *s;
  54         size_t len;
  55         int offset;
  56         size_t subseplen;
  57         char *subsep;
  58
  59         if (tree->type != Node_expression_list)
  60                 return force_string(tree_eval(tree));
  61         r = force_string(tree_eval(tree->lnode));
  62         if (tree->rnode == NULL)
  63                 return r;
  64         subseplen = SUBSEP_node->lnode->stlen;
  65         subsep = SUBSEP_node->lnode->stptr;
  66         len = r->stlen + subseplen + 2;
  67         emalloc(str, char *, len, "concat_exp");
  68         memcpy(str, r->stptr, r->stlen+1);
  69         s = str + r->stlen;
  70         free_temp(r);
  71         for (tree = tree->rnode; tree != NULL; tree = tree->rnode) {
  72                 if (subseplen == 1)
  73                         *s++ = *subsep;
  74                 else {
  75                         memcpy(s, subsep, subseplen+1);
  76                         s += subseplen;
  77                 }
  78                 r = force_string(tree_eval(tree->lnode));
  79                 len += r->stlen + subseplen;
  80                 offset = s - str;
  81                 erealloc(str, char *, len, "concat_exp");
  82                 s = str + offset;
  83                 memcpy(s, r->stptr, r->stlen+1);
  84                 s += r->stlen;
  85                 free_temp(r);
  86         }
  87         r = make_str_node(str, s - str, ALREADY_MALLOCED);
  88         r->flags |= TEMP;
  89         return r;
  90 }
  91
  92 /* assoc_clear --- flush all the values in symbol[] before doing a split() */
  93
  94 void
  95 assoc_clear(symbol)
  96 NODE *symbol;
  97 {
  98         int i;
  99         NODE *bucket, *next;
 100
 101         if (symbol->var_array == NULL)
 102                 return;
 103         for (i = 0; i < symbol->array_size; i++) {
 104                 for (bucket = symbol->var_array[i]; bucket != NULL; bucket = next) {
 105                         next = bucket->ahnext;
 106                         unref(bucket->ahname);
 107                         unref(bucket->ahvalue);
 108                         freenode(bucket);
 109                 }
 110                 symbol->var_array[i] = NULL;
 111         }
 112         free(symbol->var_array);
 113         symbol->var_array = NULL;
 114         symbol->array_size = symbol->table_size = 0;
 115         symbol->flags &= ~ARRAYMAXED;
 116 }
 117
 118 /* hash --- calculate the hash function of the string in subs */
 119
 120 unsigned int
 121 hash(s, len, hsize)
 122 register const char *s;
 123 register size_t len;
 124 unsigned long hsize;
 125 {
 126         register unsigned long h = 0;
 127
 128         /*
 129          * This is INCREDIBLY ugly, but fast.  We break the string up into
 130          * 8 byte units.  On the first time through the loop we get the
 131          * "leftover bytes" (strlen % 8).  On every other iteration, we
 132          * perform 8 HASHC's so we handle all 8 bytes.  Essentially, this
 133          * saves us 7 cmp & branch instructions.  If this routine is
 134          * heavily used enough, it's worth the ugly coding.
 135          *
 136          * OZ's original sdbm hash, copied from Margo Seltzers db package.
 137          */
 138
 139         /*
 140          * Even more speed:
 141          * #define HASHC   h = *s++ + 65599 * h
 142          * Because 65599 = pow(2, 6) + pow(2, 16) - 1 we multiply by shifts
 143          */
 144 #define HASHC   htmp = (h << 6);  \
 145                 h = *s++ + htmp + (htmp << 10) - h
 146
 147         unsigned long htmp;
 148
 149         h = 0;
 150
 151 #if defined(VAXC)
 152         /*
 153          * This was an implementation of "Duff's Device", but it has been
 154          * redone, separating the switch for extra iterations from the
 155          * loop. This is necessary because the DEC VAX-C compiler is
 156          * STOOPID.
 157          */
 158         switch (len & (8 - 1)) {
 159         case 7:         HASHC;
 160         case 6:         HASHC;
 161         case 5:         HASHC;
 162         case 4:         HASHC;
 163         case 3:         HASHC;
 164         case 2:         HASHC;
 165         case 1:         HASHC;
 166         default:        break;
 167         }
 168
 169         if (len > (8 - 1)) {
 170                 register size_t loop = len >> 3;
 171                 do {
 172                         HASHC;
 173                         HASHC;
 174                         HASHC;
 175                         HASHC;
 176                         HASHC;
 177                         HASHC;
 178                         HASHC;
 179                         HASHC;
 180                 } while (--loop);
 181         }
 182 #else /* ! VAXC */
 183         /* "Duff's Device" for those who can handle it */
 184         if (len > 0) {
 185                 register size_t loop = (len + 8 - 1) >> 3;
 186
 187                 switch (len & (8 - 1)) {
 188                 case 0:
 189                         do {    /* All fall throughs */
 190                                 HASHC;
 191                 case 7:         HASHC;
 192                 case 6:         HASHC;
 193                 case 5:         HASHC;
 194                 case 4:         HASHC;
 195                 case 3:         HASHC;
 196                 case 2:         HASHC;
 197                 case 1:         HASHC;
 198                         } while (--loop);
 199                 }
 200         }
 201 #endif /* ! VAXC */
 202
 203         if (h >= hsize)
 204                 h %= hsize;
 205         return h;
 206 }
 207
 208 /* assoc_find --- locate symbol[subs] */
 209
 210 static NODE *                           /* NULL if not found */
 211 assoc_find(symbol, subs, hash1)
 212 NODE *symbol;
 213 register NODE *subs;
 214 int hash1;
 215 {
 216         register NODE *bucket;
 217
 218         for (bucket = symbol->var_array[hash1]; bucket != NULL;
 219                         bucket = bucket->ahnext) {
 220                 if (cmp_nodes(bucket->ahname, subs) == 0)
 221                         return bucket;
 222         }
 223         return NULL;
 224 }
 225
 226 /* in_array --- test whether the array element symbol[subs] exists or not */
 227
 228 int
 229 in_array(symbol, subs)
 230 NODE *symbol, *subs;
 231 {
 232         register int hash1;
 233         int ret;
 234
 235         if (symbol->type == Node_param_list)
 236                 symbol = stack_ptr[symbol->param_cnt];
 237         if ((symbol->flags & SCALAR) != 0)
 238                 fatal("attempt to use scalar as array");
 239         /*
 240          * evaluate subscript first, it could have side effects
 241          */
 242         subs = concat_exp(subs);        /* concat_exp returns a string node */
 243         if (symbol->var_array == NULL) {
 244                 free_temp(subs);
 245                 return 0;
 246         }
 247         hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size);
 248         ret = (assoc_find(symbol, subs, hash1) != NULL);
 249         free_temp(subs);
 250         return ret;
 251 }
 252
 253 /*
 254  * assoc_lookup:
 255  * Find SYMBOL[SUBS] in the assoc array.  Install it with value "" if it
 256  * isn't there. Returns a pointer ala get_lhs to where its value is stored.
 257  *
 258  * SYMBOL is the address of the node (or other pointer) being dereferenced.
 259  * SUBS is a number or string used as the subscript.
 260  */
 261
 262 NODE **
 263 assoc_lookup(symbol, subs)
 264 NODE *symbol, *subs;
 265 {
 266         register int hash1;
 267         register NODE *bucket;
 268
 269         (void) force_string(subs);
 270
 271         if ((symbol->flags & SCALAR) != 0)
 272                 fatal("attempt to use scalar as array");
 273
 274         if (symbol->var_array == NULL) {
 275                 symbol->type = Node_var_array;
 276                 symbol->array_size = symbol->table_size = 0;    /* sanity */
 277                 symbol->flags &= ~ARRAYMAXED;
 278                 grow_table(symbol);
 279                 hash1 = hash(subs->stptr, subs->stlen,
 280                                 (unsigned long) symbol->array_size);
 281         } else {
 282                 hash1 = hash(subs->stptr, subs->stlen,
 283                                 (unsigned long) symbol->array_size);
 284                 bucket = assoc_find(symbol, subs, hash1);
 285                 if (bucket != NULL) {
 286                         free_temp(subs);
 287                         return &(bucket->ahvalue);
 288                 }
 289         }
 290
 291         /* It's not there, install it. */
 292         if (do_lint && subs->stlen == 0)
 293                 warning("subscript of array `%s' is null string",
 294                         symbol->vname);
 295
 296         /* first see if we would need to grow the array, before installing */
 297         symbol->table_size++;
 298         if ((symbol->flags & ARRAYMAXED) == 0
 299             && symbol->table_size/symbol->array_size > AVG_CHAIN_MAX) {
 300                 grow_table(symbol);
 301                 /* have to recompute hash value for new size */
 302                 hash1 = hash(subs->stptr, subs->stlen,
 303                                 (unsigned long) symbol->array_size);
 304         }
 305
 306         getnode(bucket);
 307         bucket->type = Node_ahash;
 308         if (subs->flags & TEMP)
 309                 bucket->ahname = dupnode(subs);
 310         else {
 311                 unsigned int saveflags = subs->flags;
 312
 313                 subs->flags &= ~MALLOC;
 314                 bucket->ahname = dupnode(subs);
 315                 subs->flags = saveflags;
 316         }
 317         free_temp(subs);
 318
 319         /* array subscripts are strings */
 320         bucket->ahname->flags &= ~NUMBER;
 321         bucket->ahname->flags |= STRING;
 322         bucket->ahvalue = Nnull_string;
 323         bucket->ahnext = symbol->var_array[hash1];
 324         symbol->var_array[hash1] = bucket;
 325         return &(bucket->ahvalue);
 326 }
 327
 328 /* do_delete --- perform `delete array[s]' */
 329
 330 void
 331 do_delete(symbol, tree)
 332 NODE *symbol, *tree;
 333 {
 334         register int hash1;
 335         register NODE *bucket, *last;
 336         NODE *subs;
 337
 338         if (symbol->type == Node_param_list) {
 339                 symbol = stack_ptr[symbol->param_cnt];
 340                 if (symbol->type == Node_var)
 341                         return;
 342         }
 343         if (symbol->type == Node_var_array) {
 344                 if (symbol->var_array == NULL)
 345                         return;
 346         } else
 347                 fatal("delete: illegal use of variable `%s' as array",
 348                         symbol->vname);
 349
 350         if (tree == NULL) {     /* delete array */
 351                 assoc_clear(symbol);
 352                 return;
 353         }
 354
 355         subs = concat_exp(tree);        /* concat_exp returns string node */
 356         hash1 = hash(subs->stptr, subs->stlen, (unsigned long) symbol->array_size);
 357
 358         last = NULL;
 359         for (bucket = symbol->var_array[hash1]; bucket != NULL;
 360                         last = bucket, bucket = bucket->ahnext)
 361                 if (cmp_nodes(bucket->ahname, subs) == 0)
 362                         break;
 363         free_temp(subs);
 364         if (bucket == NULL) {
 365                 if (do_lint)
 366                         warning("delete: index `%s' not in array `%s'",
 367                                 subs->stptr, symbol->vname);
 368                 return;
 369         }
 370         if (last != NULL)
 371                 last->ahnext = bucket->ahnext;
 372         else
 373                 symbol->var_array[hash1] = bucket->ahnext;
 374         unref(bucket->ahname);
 375         unref(bucket->ahvalue);
 376         freenode(bucket);
 377         symbol->table_size--;
 378         if (symbol->table_size <= 0) {
 379                 memset(symbol->var_array, '\0',
 380                         sizeof(NODE *) * symbol->array_size);
 381                 symbol->table_size = symbol->array_size = 0;
 382                 symbol->flags &= ~ARRAYMAXED;
 383                 free((char *) symbol->var_array);
 384                 symbol->var_array = NULL;
 385         }
 386 }
 387
 388 /* assoc_scan --- start a ``for (iggy in foo)'' loop */
 389
 390 void
 391 assoc_scan(symbol, lookat)
 392 NODE *symbol;
 393 struct search *lookat;
 394 {
 395         lookat->sym = symbol;
 396         lookat->idx = 0;
 397         lookat->bucket = NULL;
 398         lookat->retval = NULL;
 399         if (symbol->var_array != NULL)
 400                 assoc_next(lookat);
 401 }
 402
 403 /* assoc_next --- actually find the next element in array */
 404
 405 void
 406 assoc_next(lookat)
 407 struct search *lookat;
 408 {
 409         register NODE *symbol = lookat->sym;
 410
 411         if (symbol == NULL)
 412                 fatal("null symbol in assoc_next");
 413         if (symbol->var_array == NULL || lookat->idx > symbol->array_size) {
 414                 lookat->retval = NULL;
 415                 return;
 416         }
 417         /*
 418          * This is theoretically unsafe.  The element bucket might have
 419          * been freed if the body of the scan did a delete on the next
 420          * element of the bucket.  The only way to do that is by array
 421          * reference, which is unlikely.  Basically, if the user is doing
 422          * anything other than an operation on the current element of an
 423          * assoc array while walking through it sequentially, all bets are
 424          * off.  (The safe way is to register all search structs on an
 425          * array with the array, and update all of them on a delete or
 426          * insert)
 427          */
 428         if (lookat->bucket != NULL) {
 429                 lookat->retval = lookat->bucket->ahname;
 430                 lookat->bucket = lookat->bucket->ahnext;
 431                 return;
 432         }
 433         for (; lookat->idx < symbol->array_size; lookat->idx++) {
 434                 NODE *bucket;
 435
 436                 if ((bucket = symbol->var_array[lookat->idx]) != NULL) {
 437                         lookat->retval = bucket->ahname;
 438                         lookat->bucket = bucket->ahnext;
 439                         lookat->idx++;
 440                         return;
 441                 }
 442         }
 443         lookat->retval = NULL;
 444         lookat->bucket = NULL;
 445         return;
 446 }
 447
 448 /* grow_table --- grow a hash table */
 449
 450 static void
 451 grow_table(symbol)
 452 NODE *symbol;
 453 {
 454         NODE **old, **new, *chain, *next;
 455         int i, j;
 456         unsigned long hash1;
 457         unsigned long oldsize, newsize;
 458         /*
 459          * This is an array of primes. We grow the table by an order of
 460          * magnitude each time (not just doubling) so that growing is a
 461          * rare operation. We expect, on average, that it won't happen
 462          * more than twice.  The final size is also chosen to be small
 463          * enough so that MS-DOG mallocs can handle it. When things are
 464          * very large (> 8K), we just double more or less, instead of
 465          * just jumping from 8K to 64K.
 466          */
 467         static long sizes[] = { 13, 127, 1021, 8191, 16381, 32749, 65497,
 468 #if ! defined(MSDOS) && ! defined(OS2) && ! defined(atarist)
 469                                 131101, 262147, 524309, 1048583, 2097169,
 470                                 4194319, 8388617, 16777259, 33554467,
 471                                 67108879, 134217757, 268435459, 536870923,
 472                                 1073741827
 473 #endif
 474         };
 475
 476         /* find next biggest hash size */
 477         newsize = oldsize = symbol->array_size;
 478         for (i = 0, j = sizeof(sizes)/sizeof(sizes[0]); i < j; i++) {
 479                 if (oldsize < sizes[i]) {
 480                         newsize = sizes[i];
 481                         break;
 482                 }
 483         }
 484
 485         if (newsize == oldsize) {       /* table already at max (!) */
 486                 symbol->flags |= ARRAYMAXED;
 487                 return;
 488         }
 489
 490         /* allocate new table */
 491         emalloc(new, NODE **, newsize * sizeof(NODE *), "grow_table");
 492         memset(new, '\0', newsize * sizeof(NODE *));
 493
 494         /* brand new hash table, set things up and return */
 495         if (symbol->var_array == NULL) {
 496                 symbol->table_size = 0;
 497                 goto done;
 498         }
 499
 500         /* old hash table there, move stuff to new, free old */
 501         old = symbol->var_array;
 502         for (i = 0; i < oldsize; i++) {
 503                 if (old[i] == NULL)
 504                         continue;
 505
 506                 for (chain = old[i]; chain != NULL; chain = next) {
 507                         next = chain->ahnext;
 508                         hash1 = hash(chain->ahname->stptr,
 509                                         chain->ahname->stlen, newsize);
 510
 511                         /* remove from old list, add to new */
 512                         chain->ahnext = new[hash1];
 513                         new[hash1] = chain;
 514
 515                 }
 516         }
 517         free(old);
 518
 519 done:
 520         /*
 521          * note that symbol->table_size does not change if an old array,
 522          * and is explicitly set to 0 if a new one.
 523          */
 524         symbol->var_array = new;
 525         symbol->array_size = newsize;
 526 }