4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
27 #pragma ident "%Z%%M% %I% %E% SMI"
29 #include <_string_table.h>
35 * This file provides the interfaces to build a Str_tbl suitable for use by
36 * either the sgsmsg message system, or a standard ELF string table (SHT_STRTAB)
37 * as created by ld(1).
39 * There are two modes which can be used when constructing a string table:
42 * standard string table - no compression. This is the
43 * traditional, fast method.
45 * st_new(FLG_STTAB_COMPRESS)
46 * builds a compressed string table which both eliminates
47 * duplicate strings, and permits strings with common suffixes
48 * (atexit vs. exit) to overlap in the table. This provides space
49 * savings for many string tables. Although more work than the
50 * traditional method, the algorithms used are designed to scale
51 * and keep any overhead at a minimum.
53 * These string tables are built with a common interface in a two-pass manner.
54 * The first pass finds all of the strings required for the string-table and
55 * calculates the size required for the final string table.
57 * The second pass allocates the string table, populates the strings into the
58 * table and returns the offsets the strings have been assigned.
60 * The calling sequence to build and populate a string table is:
62 * st_new(); // initialize strtab
64 * st_insert(st1); // first pass of strings ...
65 * // calculates size required for
68 * st_delstring(st?); // remove string previously
72 * st_getstrtab_sz(); // freezes strtab and computes
75 * st_setstrbuf(); // associates a final destination
76 * // for the string table
78 * st_setstring(st1); // populate the string table
79 * ... // offsets are based off of second
80 * // pass through the string table
83 * st_destroy(); // tear down string table
86 * String Suffix Compression Algorithm:
88 * Here's a quick high level overview of the Suffix String
89 * compression algorithm used. First - the heart of the algorithm
90 * is a Hash table list which represents a dictionary of all unique
91 * strings inserted into the string table. The hash function for
92 * this table is a standard string hash except that the hash starts
93 * at the last character in the string (&str[n - 1]) and works towards
94 * the first character in the function (&str[0]). As we compute the
95 * HASH value for a given string, we also compute the hash values
96 * for all of the possible suffix strings for that string.
98 * As we compute the hash - at each character see if the current
99 * suffix string for that hash is already present in the table. If
100 * it is, and the string is a master string. Then change that
101 * string to a suffix string of the new string being inserted.
103 * When the final hash value is found (hash for str[0...n]), check
104 * to see if it is in the hash table - if so increment the reference
105 * count for the string. If it is not yet in the table, insert a
106 * new hash table entry for a master string.
108 * The above method will find all suffixes of a given string given
109 * that the strings are inserted from shortest to longest. That is
110 * why this is a two phase method, we first collect all of the
111 * strings and store them based off of their length in an AVL tree.
112 * Once all of the strings have been submitted we then start the
113 * hash table build by traversing the AVL tree in order and
114 * inserting the strings from shortest to longest as described
121 avl_len_compare(const void *n1, const void *n2)
125 len1 = ((LenNode *)n1)->ln_strlen;
126 len2 = ((LenNode *)n2)->ln_strlen;
136 avl_str_compare(const void *n1, const void *n2)
138 const char *str1, *str2;
141 str1 = ((StrNode *)n1)->sn_str;
142 str2 = ((StrNode *)n2)->sn_str;
144 rc = strcmp(str1, str2);
153 * Return an initialized Str_tbl - returns NULL on failure.
156 * FLG_STTAB_COMPRESS - build a compressed string table
163 if ((stp = calloc(sizeof (Str_tbl), 1)) == NULL)
167 * Start with a leading '\0' - it's tradition.
169 stp->st_strsize = stp->st_fullstrsize = stp->st_nextoff = 1;
172 * Do we compress this string table?
174 stp->st_flags = flags;
175 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
178 if ((stp->st_lentree = calloc(sizeof (avl_tree_t), 1)) == NULL)
181 avl_create(stp->st_lentree, &avl_len_compare, sizeof (LenNode),
182 SGSOFFSETOF(LenNode, ln_avlnode));
188 * Insert a new string into the Str_tbl. There are two AVL trees used.
190 * . The first LenNode AVL tree maintains a tree of nodes based on string
192 * . Each LenNode maintains a StrNode AVL tree for each string. Large
193 * applications have been known to contribute thousands of strings of
194 * the same size. Should strings need to be removed (-z ignore), then
195 * the string AVL tree makes this removal efficient and scalable.
198 st_insert(Str_tbl *stp, const char *str)
201 StrNode *snp, sn = { 0 };
202 LenNode *lnp, ln = { 0 };
206 * String table can't have been cooked
208 assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
211 * Null strings always point to the head of the string
212 * table - no reason to keep searching.
214 if ((len = strlen(str)) == 0)
217 stp->st_fullstrsize += len + 1;
220 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
224 * From the controlling string table, determine which LenNode AVL node
225 * provides for this string length. If the node doesn't exist, insert
226 * a new node to represent this string length.
229 if ((lnp = avl_find(stp->st_lentree, &ln, &where)) == NULL) {
230 if ((lnp = calloc(sizeof (LenNode), 1)) == NULL)
232 lnp->ln_strlen = len;
233 avl_insert(stp->st_lentree, lnp, where);
235 if ((lnp->ln_strtree = calloc(sizeof (avl_tree_t), 1)) == NULL)
238 avl_create(lnp->ln_strtree, &avl_str_compare, sizeof (StrNode),
239 SGSOFFSETOF(StrNode, sn_avlnode));
243 * From the string length AVL node determine whether a StrNode AVL node
244 * provides this string. If the node doesn't exist, insert a new node
245 * to represent this string.
248 if ((snp = avl_find(lnp->ln_strtree, &sn, &where)) == NULL) {
249 if ((snp = calloc(sizeof (StrNode), 1)) == NULL)
252 avl_insert(lnp->ln_strtree, snp, where);
260 * Remove a previously inserted string from the Str_tbl.
263 st_delstring(Str_tbl *stp, const char *str)
266 LenNode *lnp, ln = { 0 };
267 StrNode *snp, sn = { 0 };
270 * String table can't have been cooked
272 assert((stp->st_flags & FLG_STTAB_COOKED) == 0);
275 stp->st_fullstrsize -= len + 1;
277 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0)
281 * Determine which LenNode AVL node provides for this string length.
284 if ((lnp = avl_find(stp->st_lentree, &ln, 0)) != NULL) {
286 if ((snp = avl_find(lnp->ln_strtree, &sn, 0)) != NULL) {
288 * Reduce the reference count, and if zero remove the
291 if (--snp->sn_refcnt == 0)
292 avl_remove(lnp->ln_strtree, snp);
298 * No strings of this length, or no string itself - someone goofed.
304 * Tear down a String_Table structure.
307 st_destroy(Str_tbl *stp)
309 Str_hash *sthash, *psthash;
310 Str_master *mstr, *pmstr;
314 * cleanup the master strings
316 for (mstr = stp->st_mstrlist, pmstr = 0; mstr;
317 mstr = mstr->sm_next) {
325 if (stp->st_hashbcks) {
326 for (i = 0; i < stp->st_hbckcnt; i++) {
327 for (sthash = stp->st_hashbcks[i], psthash = 0;
328 sthash; sthash = sthash->hi_next) {
336 free(stp->st_hashbcks);
343 * For a given string - copy it into the buffer associated with
344 * the string table - and return the offset it has been assigned.
346 * If a value of '-1' is returned - the string was not found in
350 st_setstring(Str_tbl *stp, const char *str, size_t *stoff)
359 * String table *must* have been previously cooked
361 assert(stp->st_strbuf);
363 assert(stp->st_flags & FLG_STTAB_COOKED);
366 * Null string always points to head of string table
373 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
376 stlen++; /* count for trailing '\0' */
377 _stoff = stp->st_nextoff;
379 * Have we overflowed our assigned buffer?
381 if ((_stoff + stlen) > stp->st_fullstrsize)
383 memcpy(stp->st_strbuf + _stoff, str, stlen);
385 stp->st_nextoff += stlen;
390 * Calculate reverse hash for string.
393 for (i = stlen; i >= 0; i--) {
394 hashval = ((hashval << 5) + hashval) +
395 str[i]; /* h = ((h * 33) + c) */
398 for (sthash = stp->st_hashbcks[hashval % stp->st_hbckcnt]; sthash;
399 sthash = sthash->hi_next) {
402 if (sthash->hi_hashval != hashval)
405 hstr = &sthash->hi_mstr->sm_str[sthash->hi_mstr->sm_strlen -
407 if (strcmp(str, hstr) == 0)
412 * Did we find the string?
418 * Has this string been copied into the string table?
420 mstr = sthash->hi_mstr;
421 if (mstr->sm_stroff == 0) {
422 size_t mstrlen = mstr->sm_strlen + 1;
424 mstr->sm_stroff = stp->st_nextoff;
427 * Have we overflowed our assigned buffer?
429 if ((mstr->sm_stroff + mstrlen) > stp->st_fullstrsize)
432 (void) memcpy(stp->st_strbuf + mstr->sm_stroff,
433 mstr->sm_str, mstrlen);
434 stp->st_nextoff += mstrlen;
438 * Calculate offset of (sub)string.
440 *stoff = mstr->sm_stroff + mstr->sm_strlen - sthash->hi_strlen;
447 st_hash_insert(Str_tbl *stp, const char *str, size_t len)
450 uint_t hashval = HASHSEED;
451 uint_t bckcnt = stp->st_hbckcnt;
452 Str_hash **hashbcks = stp->st_hashbcks;
454 Str_master *mstr = 0;
457 * We use a classic 'Bernstein k=33' hash function. But
458 * instead of hashing from the start of the string to the
459 * end, we do it in reverse.
461 * This way - we are essentially building all of the
462 * suffix hashvalues as we go. We can check to see if
463 * any suffixes already exist in the tree as we generate
466 for (i = len; i >= 0; i--) {
467 hashval = ((hashval << 5) + hashval) +
468 str[i]; /* h = ((h * 33) + c) */
470 for (sthash = hashbcks[hashval % bckcnt];
471 sthash; sthash = sthash->hi_next) {
475 if (sthash->hi_hashval != hashval)
478 _mstr = sthash->hi_mstr;
479 hstr = &_mstr->sm_str[_mstr->sm_strlen -
482 if (strcmp(&str[i], hstr))
487 * Entry already in table, increment refcnt and
494 * If this 'suffix' is presently a 'master
495 * string, then take over it's record.
497 if (sthash->hi_strlen == _mstr->sm_strlen) {
499 * we should only do this once.
509 * Do we need a new master string, or can we take over
510 * one we already found in the table?
514 * allocate a new master string
516 if ((mstr = calloc(sizeof (Str_hash), 1)) == 0)
518 mstr->sm_next = stp->st_mstrlist;
519 stp->st_mstrlist = mstr;
520 stp->st_strsize += len + 1;
523 * We are taking over a existing master string, the string size
524 * only increments by the difference between the current string
525 * and the previous master.
527 assert(len > mstr->sm_strlen);
528 stp->st_strsize += len - mstr->sm_strlen;
531 if ((sthash = calloc(sizeof (Str_hash), 1)) == 0)
534 mstr->sm_hashval = sthash->hi_hashval = hashval;
535 mstr->sm_strlen = sthash->hi_strlen = len;
537 sthash->hi_refcnt = 1;
538 sthash->hi_mstr = mstr;
541 * Insert string element into head of hash list
543 hashval = hashval % bckcnt;
544 sthash->hi_next = hashbcks[hashval];
545 hashbcks[hashval] = sthash;
550 * Return amount of space required for the string table.
553 st_getstrtab_sz(Str_tbl *stp)
555 assert(stp->st_fullstrsize > 0);
557 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
558 stp->st_flags |= FLG_STTAB_COOKED;
559 return (stp->st_fullstrsize);
562 if ((stp->st_flags & FLG_STTAB_COOKED) == 0) {
566 stp->st_flags |= FLG_STTAB_COOKED;
568 * allocate a hash table about the size of # of
571 stp->st_hbckcnt = findprime(stp->st_strcnt);
572 if ((stp->st_hashbcks =
573 calloc(sizeof (Str_hash), stp->st_hbckcnt)) == NULL)
577 * We now walk all of the strings in the list, from shortest to
578 * longest, and insert them into the hashtable.
580 if ((lnp = avl_first(stp->st_lentree)) == NULL) {
582 * Is it possible we have an empty string table, if so,
583 * the table still contains '\0', so return the size.
585 if (avl_numnodes(stp->st_lentree) == 0) {
586 assert(stp->st_strsize == 1);
587 return (stp->st_strsize);
596 * Walk the string lists and insert them into the hash
597 * list. Once a string is inserted we no longer need
598 * it's entry, so the string can be freed.
600 for (snp = avl_first(lnp->ln_strtree); snp;
601 snp = AVL_NEXT(lnp->ln_strtree, snp)) {
602 if (st_hash_insert(stp, snp->sn_str,
603 lnp->ln_strlen) == -1)
608 * Now that the strings have been copied, walk the
609 * StrNode tree and free all the AVL nodes. Note,
610 * avl_destroy_nodes() beats avl_remove() as the
611 * latter balances the nodes as they are removed.
612 * We just want to tear the whole thing down fast.
615 while ((snp = avl_destroy_nodes(lnp->ln_strtree,
618 avl_destroy(lnp->ln_strtree);
619 free(lnp->ln_strtree);
620 lnp->ln_strtree = NULL;
623 * Move on to the next LenNode.
625 lnp = AVL_NEXT(stp->st_lentree, lnp);
629 * Now that all of the strings have been freed, walk the
630 * LenNode tree and free all of the AVL nodes. Note,
631 * avl_destroy_nodes() beats avl_remove() as the latter
632 * balances the nodes as they are removed. We just want to
633 * tear the whole thing down fast.
636 while ((lnp = avl_destroy_nodes(stp->st_lentree,
639 avl_destroy(stp->st_lentree);
640 free(stp->st_lentree);
644 assert(stp->st_strsize > 0);
645 assert(stp->st_fullstrsize >= stp->st_strsize);
647 return (stp->st_strsize);
651 * Associate a buffer with a string table.
654 st_getstrbuf(Str_tbl *stp)
656 return (stp->st_strbuf);
660 st_setstrbuf(Str_tbl *stp, char *stbuf, size_t bufsize)
662 assert(stp->st_flags & FLG_STTAB_COOKED);
664 if ((stp->st_flags & FLG_STTAB_COMPRESS) == 0) {
665 if (bufsize < stp->st_fullstrsize)
668 if (bufsize < stp->st_strsize)
672 stp->st_strbuf = stbuf;
675 * for debug builds - start with a stringtable filled in
676 * with '0xff'. This makes it very easy to find wholes
677 * which we failed to fill in - in the strtab.
679 memset(stbuf, 0xff, bufsize);
682 memset(stbuf, 0x0, bufsize);