contrib/ntp/libopts/tokenize.c

   1 /*
   2  *  This file defines the string_tokenize interface
   3  * Time-stamp:      "2006-06-24 15:27:49 bkorb"
   4  *
   5  *  string_tokenize copyright 2005 Bruce Korb
   6  *
   7  *  string_tokenize is free software; you can redistribute it and/or
   8  *  modify it under the terms of the GNU Lesser General Public
   9  *  License as published by the Free Software Foundation; either
  10  *  version 2.1 of the License, or (at your option) any later version.
  11  *
  12  *  string_tokenize is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  Lesser General Public License for more details.
  16  *
  17  *  You should have received a copy of the GNU Lesser General Public
  18  *  License along with string_tokenize; if not, write to:
  19  *             The Free Software Foundation, Inc.,
  20  *             51 Franklin Street, Fifth Floor,
  21  *             Boston, MA  02110-1301, USA.
  22  */
#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
  26
  27 #define cc_t   const unsigned char
  28 #define ch_t   unsigned char
  29
  30 /* = = = START-STATIC-FORWARD = = = */
  31 /* static forward declarations maintained by :mkfwd */
  32 static void
  33 copy_cooked( ch_t** ppDest, char const ** ppSrc );
  34
  35 static void
  36 copy_raw( ch_t** ppDest, char const ** ppSrc );
  37 /* = = = END-STATIC-FORWARD = = = */
  38
  39 static void
  40 copy_cooked( ch_t** ppDest, char const ** ppSrc )
  41 {
  42     ch_t* pDest = (ch_t*)*ppDest;
  43     const ch_t* pSrc  = (const ch_t*)(*ppSrc + 1);
  44
  45     for (;;) {
  46         ch_t ch = *(pSrc++);
  47         switch (ch) {
  48         case NUL:   *ppSrc = NULL; return;
  49         case '"':   goto done;
  50         case '\\':
  51             pSrc += ao_string_cook_escape_char( (char*)pSrc, (char*)&ch, 0x7F );
  52             if (ch == 0x7F)
  53                 break;
  54             /* FALLTHROUGH */
  55
  56         default:
  57             *(pDest++) = ch;
  58         }
  59     }
  60
  61  done:
  62     *ppDest = (ch_t*)pDest; /* next spot for storing character */
  63     *ppSrc  = (char const *)pSrc;  /* char following closing quote    */
  64 }
  65
  66
  67 static void
  68 copy_raw( ch_t** ppDest, char const ** ppSrc )
  69 {
  70     ch_t* pDest = *ppDest;
  71     cc_t* pSrc  = (cc_t*) (*ppSrc + 1);
  72
  73     for (;;) {
  74         ch_t ch = *(pSrc++);
  75         switch (ch) {
  76         case NUL:   *ppSrc = NULL; return;
  77         case '\'':  goto done;
  78         case '\\':
  79             /*
  80              *  *Four* escapes are handled:  newline removal, escape char
  81              *  quoting and apostrophe quoting
  82              */
  83             switch (*pSrc) {
  84             case NUL:   *ppSrc = NULL; return;
  85             case '\r':
  86                 if (*(++pSrc) == '\n')
  87                     ++pSrc;
  88                 continue;
  89
  90             case '\n':
  91                 ++pSrc;
  92                 continue;
  93
  94             case '\'':
  95                 ch = '\'';
  96                 /* FALLTHROUGH */
  97
  98             case '\\':
  99                 ++pSrc;
 100                 break;
 101             }
 102             /* FALLTHROUGH */
 103
 104         default:
 105             *(pDest++) = ch;
 106         }
 107     }
 108
 109  done:
 110     *ppDest = pDest; /* next spot for storing character */
 111     *ppSrc  = (char const *) pSrc;  /* char following closing quote    */
 112 }
 113
 114
 115 /*=export_func ao_string_tokenize
 116  *
 117  * what: tokenize an input string
 118  *
 119  * arg:  + char const* + string + string to be tokenized +
 120  *
 121  * ret_type:  token_list_t*
 122  * ret_desc:  pointer to a structure that lists each token
 123  *
 124  * doc:
 125  *
 126  * This function will convert one input string into a list of strings.
 127  * The list of strings is derived by separating the input based on
 128  * white space separation.  However, if the input contains either single
 129  * or double quote characters, then the text after that character up to
 130  * a matching quote will become the string in the list.
 131  *
 132  *  The returned pointer should be deallocated with @code{free(3C)} when
 133  *  are done using the data.  The data are placed in a single block of
 134  *  allocated memory.  Do not deallocate individual token/strings.
 135  *
 136  *  The structure pointed to will contain at least these two fields:
 137  *  @table @samp
 138  *  @item tkn_ct
 139  *  The number of tokens found in the input string.
 140  *  @item tok_list
 141  *  An array of @code{tkn_ct + 1} pointers to substring tokens, with
 142  *  the last pointer set to NULL.
 143  *  @end table
 144  *
 145  * There are two types of quoted strings: single quoted (@code{'}) and
 146  * double quoted (@code{"}).  Singly quoted strings are fairly raw in that
 147  * escape characters (@code{\\}) are simply another character, except when
 148  * preceding the following characters:
 149  * @example
 150  * @code{\\}  double backslashes reduce to one
 151  * @code{'}   incorporates the single quote into the string
 152  * @code{\n}  suppresses both the backslash and newline character
 153  * @end example
 154  *
 155  * Double quote strings are formed according to the rules of string
 156  * constants in ANSI-C programs.
 157  *
 158  * example:
 159  * @example
 160  *    #include <stdlib.h>
 161  *    int ix;
 162  *    token_list_t* ptl = ao_string_tokenize( some_string )
 163  *    for (ix = 0; ix < ptl->tkn_ct; ix++)
 164  *       do_something_with_tkn( ptl->tkn_list[ix] );
 165  *    free( ptl );
 166  * @end example
 167  * Note that everything is freed with the one call to @code{free(3C)}.
 168  *
 169  * err:
 170  *  NULL is returned and @code{errno} will be set to indicate the problem:
 171  *  @itemize @bullet
 172  *  @item
 173  *  @code{EINVAL} - There was an unterminated quoted string.
 174  *  @item
 175  *  @code{ENOENT} - The input string was empty.
 176  *  @item
 177  *  @code{ENOMEM} - There is not enough memory.
 178  *  @end itemize
 179 =*/
 180 token_list_t*
 181 ao_string_tokenize( char const* str )
 182 {
 183     int max_token_ct = 1; /* allow for trailing NUL on string */
 184     token_list_t* res;
 185
 186     if (str == NULL)  goto bogus_str;
 187
 188     /*
 189      *  Trim leading white space.  Use "ENOENT" and a NULL return to indicate
 190      *  an empty string was passed.
 191      */
 192     while (isspace( (ch_t)*str ))  str++;
 193     if (*str == NUL) {
 194     bogus_str:
 195         errno = ENOENT;
 196         return NULL;
 197     }
 198
 199     /*
 200      *  Take an approximate count of tokens.  If no quoted strings are used,
 201      *  it will be accurate.  If quoted strings are used, it will be a little
 202      *  high and we'll squander the space for a few extra pointers.
 203      */
 204     {
 205         cc_t* pz = (cc_t*)str;
 206
 207         do {
 208             max_token_ct++;
 209             while (! isspace( *++pz ))
 210                 if (*pz == NUL) goto found_nul;
 211             while (isspace( *pz ))  pz++;
 212         } while (*pz != NUL);
 213
 214     found_nul:
 215         ;
 216     }
 217
 218     res = malloc( sizeof(*res) + strlen(str) + (max_token_ct * sizeof(ch_t*)) );
 219     if (res == NULL) {
 220         errno = ENOMEM;
 221         return res;
 222     }
 223
 224     /*
 225      *  Now copy each token into the output buffer.
 226      */
 227     {
 228         ch_t* pzDest = (ch_t*)(res->tkn_list + (max_token_ct + 1));
 229         res->tkn_ct  = 0;
 230
 231         do  {
 232             res->tkn_list[ res->tkn_ct++ ] = pzDest;
 233             for (;;) {
 234                 int ch = (ch_t)*str;
 235                 if (isspace( ch )) {
 236                 found_white_space:
 237                     while (isspace( (ch_t)*++str ))  ;
 238                     break;
 239                 }
 240
 241                 switch (ch) {
 242                 case '"':
 243                     copy_cooked( &pzDest, &str );
 244                     if (str == NULL) {
 245                         free(res);
 246                         errno = EINVAL;
 247                         return NULL;
 248                     }
 249                     if (isspace( (ch_t)*str ))
 250                         goto found_white_space;
 251                     break;
 252
 253                 case '\'':
 254                     copy_raw( &pzDest, &str );
 255                     if (str == NULL) {
 256                         free(res);
 257                         errno = EINVAL;
 258                         return NULL;
 259                     }
 260                     if (isspace( (ch_t)*str ))
 261                         goto found_white_space;
 262                     break;
 263
 264                 case NUL:
 265                     goto copy_done;
 266
 267                 default:
 268                     str++;
 269                     *(pzDest++) = ch;
 270                 }
 271             } copy_done:;
 272
 273             /*
 274              * NUL terminate the last token and see if we have any more tokens.
 275              */
 276             *(pzDest++) = NUL;
 277         } while (*str != NUL);
 278
 279         res->tkn_list[ res->tkn_ct ] = NULL;
 280     }
 281
 282     return res;
 283 }
 284
 285 #ifdef TEST
 286 #include <stdio.h>
 287 #include <string.h>
 288
 289 int
 290 main( int argc, char** argv )
 291 {
 292     if (argc == 1) {
 293         printf("USAGE:  %s arg [ ... ]\n", *argv);
 294         return 1;
 295     }
 296     while (--argc > 0) {
 297         char* arg = *(++argv);
 298         token_list_t* p = ao_string_tokenize( arg );
 299         if (p == NULL) {
 300             printf( "Parsing string ``%s'' failed:\n\terrno %d (%s)\n",
 301                     arg, errno, strerror( errno ));
 302         } else {
 303             int ix = 0;
 304             printf( "Parsed string ``%s''\ninto %d tokens:\n", arg, p->tkn_ct );
 305             do {
 306                 printf( " %3d:  ``%s''\n", ix+1, p->tkn_list[ix] );
 307             } while (++ix < p->tkn_ct);
 308             free(p);
 309         }
 310     }
 311     return 0;
 312 }
 313 #endif
 314
 315 /*
 316  * Local Variables:
 317  * mode: C
 318  * c-file-style: "stroustrup"
 319  * indent-tabs-mode: nil
 320  * End:
 321  * end of autoopts/tokenize.c */