usr.bin/indent/lexi.c

   1 /*-
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
  59 #define alphanum 1
  60 #ifdef undef
  61 #define opchar 3
  62 #endif
  63
  64 struct templ {
  65     const char *rwd;
  66     int         rwcode;
  67 };
  68
  69 /*
  70  * This table has to be sorted alphabetically, because it'll be used in binary
  71  * search. For the same reason, string must be the first thing in struct templ.
  72  */
  73 struct templ specials[] =
  74 {
  75     {"auto", 10},
  76     {"break", 9},
  77     {"case", 8},
  78     {"char", 4},
  79     {"const", 4},
  80     {"default", 8},
  81     {"do", 6},
  82     {"double", 4},
  83     {"else", 6},
  84     {"enum", 3},
  85     {"extern", 10},
  86     {"float", 4},
  87     {"for", 5},
  88     {"global", 4},
  89     {"goto", 9},
  90     {"if", 5},
  91     {"int", 4},
  92     {"long", 4},
  93     {"offsetof", 1},
  94     {"register", 10},
  95     {"return", 9},
  96     {"short", 4},
  97     {"sizeof", 2},
  98     {"static", 10},
  99     {"struct", 3},
 100     {"switch", 7},
 101     {"typedef", 10},
 102     {"union", 3},
 103     {"unsigned", 4},
 104     {"void", 4},
 105     {"volatile", 4},
 106     {"while", 5}
 107 };
 108
 109 const char **typenames;
 110 int         typename_count;
 111 int         typename_top = -1;
 112
 113 char        chartype[128] =
 114 {                               /* this is used to facilitate the decision of
 115                                  * what type (alphanumeric, operator) each
 116                                  * character is */
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118     0, 0, 0, 0, 0, 0, 0, 0,
 119     0, 0, 0, 0, 0, 0, 0, 0,
 120     0, 0, 0, 0, 0, 0, 0, 0,
 121     0, 3, 0, 0, 1, 3, 3, 0,
 122     0, 0, 3, 3, 0, 3, 0, 3,
 123     1, 1, 1, 1, 1, 1, 1, 1,
 124     1, 1, 0, 0, 3, 3, 3, 3,
 125     0, 1, 1, 1, 1, 1, 1, 1,
 126     1, 1, 1, 1, 1, 1, 1, 1,
 127     1, 1, 1, 1, 1, 1, 1, 1,
 128     1, 1, 1, 0, 0, 0, 3, 1,
 129     0, 1, 1, 1, 1, 1, 1, 1,
 130     1, 1, 1, 1, 1, 1, 1, 1,
 131     1, 1, 1, 1, 1, 1, 1, 1,
 132     1, 1, 1, 0, 3, 0, 3, 0
 133 };
 134
 135 static int
 136 strcmp_type(const void *e1, const void *e2)
 137 {
 138     return (strcmp(e1, *(const char * const *)e2));
 139 }
 140
 141 int
 142 lexi(void)
 143 {
 144     int         unary_delim;    /* this is set to 1 if the current token
 145                                  * forces a following operator to be unary */
 146     static int  last_code;      /* the last token type returned */
 147     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 148     int         code;           /* internal code to be returned */
 149     char        qchar;          /* the delimiter character for a string */
 150
 151     e_token = s_token;          /* point to start of place to save token */
 152     unary_delim = false;
 153     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 154                                  * column 1 iff the last thing scanned was nl */
 155     ps.last_nl = false;
 156
 157     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 158         ps.col_1 = false;       /* leading blanks imply token is not in column
 159                                  * 1 */
 160         if (++buf_ptr >= buf_end)
 161             fill_buffer();
 162     }
 163
 164     /* Scan an alphanumeric token */
 165     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 166         /*
 167          * we have a character or number
 168          */
 169         struct templ *p;
 170
 171         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 172             int         seendot = 0,
 173                         seenexp = 0,
 174                         seensfx = 0;
 175             if (*buf_ptr == '0' &&
 176                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 177                 *e_token++ = *buf_ptr++;
 178                 *e_token++ = *buf_ptr++;
 179                 while (isxdigit(*buf_ptr)) {
 180                     CHECK_SIZE_TOKEN;
 181                     *e_token++ = *buf_ptr++;
 182                 }
 183             }
 184             else
 185                 while (1) {
 186                     if (*buf_ptr == '.') {
 187                         if (seendot)
 188                             break;
 189                         else
 190                             seendot++;
 191                     }
 192                     CHECK_SIZE_TOKEN;
 193                     *e_token++ = *buf_ptr++;
 194                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 195                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 196                             break;
 197                         else {
 198                             seenexp++;
 199                             seendot++;
 200                             CHECK_SIZE_TOKEN;
 201                             *e_token++ = *buf_ptr++;
 202                             if (*buf_ptr == '+' || *buf_ptr == '-')
 203                                 *e_token++ = *buf_ptr++;
 204                         }
 205                     }
 206                 }
 207             while (1) {
 208                 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 209                     CHECK_SIZE_TOKEN;
 210                     *e_token++ = *buf_ptr++;
 211                     seensfx |= 1;
 212                     continue;
 213                 }
 214                 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
 215                     CHECK_SIZE_TOKEN;
 216                     if (buf_ptr[1] == buf_ptr[0])
 217                         *e_token++ = *buf_ptr++;
 218                     *e_token++ = *buf_ptr++;
 219                     seensfx |= 2;
 220                     continue;
 221                 }
 222                 break;
 223             }
 224         }
 225         else
 226             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 227                 /* fill_buffer() terminates buffer with newline */
 228                 if (*buf_ptr == BACKSLASH) {
 229                     if (*(buf_ptr + 1) == '\n') {
 230                         buf_ptr += 2;
 231                         if (buf_ptr >= buf_end)
 232                             fill_buffer();
 233                         } else
 234                             break;
 235                 }
 236                 CHECK_SIZE_TOKEN;
 237                 /* copy it over */
 238                 *e_token++ = *buf_ptr++;
 239                 if (buf_ptr >= buf_end)
 240                     fill_buffer();
 241             }
 242         *e_token++ = '\0';
 243
 244         if (s_token[0] == 'L' && s_token[1] == '\0' &&
 245               (*buf_ptr == '"' || *buf_ptr == '\''))
 246             return (strpfx);
 247
 248         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 249             if (++buf_ptr >= buf_end)
 250                 fill_buffer();
 251         }
 252         ps.keyword = 0;
 253         if (l_struct && !ps.p_l_follow) {
 254                                 /* if last token was 'struct' and we're not
 255                                  * in parentheses, then this token
 256                                  * should be treated as a declaration */
 257             l_struct = false;
 258             last_code = ident;
 259             ps.last_u_d = true;
 260             return (decl);
 261         }
 262         ps.last_u_d = l_struct; /* Operator after identifier is binary
 263                                  * unless last token was 'struct' */
 264         l_struct = false;
 265         last_code = ident;      /* Remember that this is the code we will
 266                                  * return */
 267
 268         p = bsearch(s_token,
 269             specials,
 270             sizeof(specials) / sizeof(specials[0]),
 271             sizeof(specials[0]),
 272             strcmp_type);
 273         if (p == NULL) {        /* not a special keyword... */
 274             char *u;
 275
 276             /* ... so maybe a type_t or a typedef */
 277             if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
 278                 strcmp(u, "_t") == 0) || (typename_top >= 0 &&
 279                   bsearch(s_token, typenames, typename_top + 1,
 280                     sizeof(typenames[0]), strcmp_type))) {
 281                 ps.keyword = 4; /* a type name */
 282                 ps.last_u_d = true;
 283                 goto found_typename;
 284             }
 285         } else {                        /* we have a keyword */
 286             ps.keyword = p->rwcode;
 287             ps.last_u_d = true;
 288             switch (p->rwcode) {
 289             case 7:             /* it is a switch */
 290                 return (swstmt);
 291             case 8:             /* a case or default */
 292                 return (casestmt);
 293
 294             case 3:             /* a "struct" */
 295                 /*
 296                  * Next time around, we will want to know that we have had a
 297                  * 'struct'
 298                  */
 299                 l_struct = true;
 300                 /* FALLTHROUGH */
 301
 302             case 4:             /* one of the declaration keywords */
 303             found_typename:
 304                 if (ps.p_l_follow) {
 305                     /* inside parens: cast, param list, offsetof or sizeof */
 306                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
 307                     break;
 308                 }
 309                 last_code = decl;
 310                 return (decl);
 311
 312             case 5:             /* if, while, for */
 313                 return (sp_paren);
 314
 315             case 6:             /* do, else */
 316                 return (sp_nparen);
 317
 318             case 10:            /* storage class specifier */
 319                 return (storage);
 320
 321             default:            /* all others are treated like any other
 322                                  * identifier */
 323                 return (ident);
 324             }                   /* end of switch */
 325         }                       /* end of if (found_it) */
 326         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 327             char *tp = buf_ptr;
 328             while (tp < buf_end)
 329                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 330                     goto not_proc;
 331             strncpy(ps.procname, token, sizeof ps.procname - 1);
 332             if (ps.in_decl)
 333                 ps.in_parameter_declaration = 1;
 334             rparen_count = 1;
 335     not_proc:;
 336         }
 337         /*
 338          * The following hack attempts to guess whether or not the current
 339          * token is in fact a declaration keyword -- one that has been
 340          * typedefd
 341          */
 342         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 343                 && !ps.p_l_follow
 344                 && !ps.block_init
 345                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 346                     ps.last_token == decl ||
 347                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 348             ps.keyword = 4;     /* a type name */
 349             ps.last_u_d = true;
 350             last_code = decl;
 351             return decl;
 352         }
 353         if (last_code == decl)  /* if this is a declared variable, then
 354                                  * following sign is unary */
 355             ps.last_u_d = true; /* will make "int a -1" work */
 356         last_code = ident;
 357         return (ident);         /* the ident is not in the list */
 358     }                           /* end of procesing for alpanum character */
 359
 360     /* Scan a non-alphanumeric token */
 361
 362     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 363                                  * moved here */
 364     *e_token = '\0';
 365     if (++buf_ptr >= buf_end)
 366         fill_buffer();
 367
 368     switch (*token) {
 369     case '\n':
 370         unary_delim = ps.last_u_d;
 371         ps.last_nl = true;      /* remember that we just had a newline */
 372         code = (had_eof ? 0 : newline);
 373
 374         /*
 375          * if data has been exhausted, the newline is a dummy, and we should
 376          * return code to stop
 377          */
 378         break;
 379
 380     case '\'':                  /* start of quoted character */
 381     case '"':                   /* start of string */
 382         qchar = *token;
 383         if (troff) {
 384             e_token[-1] = '`';
 385             if (qchar == '"')
 386                 *e_token++ = '`';
 387             e_token = chfont(&bodyf, &stringf, e_token);
 388         }
 389         do {                    /* copy the string */
 390             while (1) {         /* move one character or [/<char>]<char> */
 391                 if (*buf_ptr == '\n') {
 392                     diag2(1, "Unterminated literal");
 393                     goto stop_lit;
 394                 }
 395                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 396                                          * since CHECK_SIZE guarantees that there
 397                                          * are at least 5 entries left */
 398                 *e_token = *buf_ptr++;
 399                 if (buf_ptr >= buf_end)
 400                     fill_buffer();
 401                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 402                     if (*buf_ptr == '\n')       /* check for escaped newline */
 403                         ++line_no;
 404                     if (troff) {
 405                         *++e_token = BACKSLASH;
 406                         if (*buf_ptr == BACKSLASH)
 407                             *++e_token = BACKSLASH;
 408                     }
 409                     *++e_token = *buf_ptr++;
 410                     ++e_token;  /* we must increment this again because we
 411                                  * copied two chars */
 412                     if (buf_ptr >= buf_end)
 413                         fill_buffer();
 414                 }
 415                 else
 416                     break;      /* we copied one character */
 417             }                   /* end of while (1) */
 418         } while (*e_token++ != qchar);
 419         if (troff) {
 420             e_token = chfont(&stringf, &bodyf, e_token - 1);
 421             if (qchar == '"')
 422                 *e_token++ = '\'';
 423         }
 424 stop_lit:
 425         code = ident;
 426         break;
 427
 428     case ('('):
 429     case ('['):
 430         unary_delim = true;
 431         code = lparen;
 432         break;
 433
 434     case (')'):
 435     case (']'):
 436         code = rparen;
 437         break;
 438
 439     case '#':
 440         unary_delim = ps.last_u_d;
 441         code = preesc;
 442         break;
 443
 444     case '?':
 445         unary_delim = true;
 446         code = question;
 447         break;
 448
 449     case (':'):
 450         code = colon;
 451         unary_delim = true;
 452         break;
 453
 454     case (';'):
 455         unary_delim = true;
 456         code = semicolon;
 457         break;
 458
 459     case ('{'):
 460         unary_delim = true;
 461
 462         /*
 463          * if (ps.in_or_st) ps.block_init = 1;
 464          */
 465         /* ?    code = ps.block_init ? lparen : lbrace; */
 466         code = lbrace;
 467         break;
 468
 469     case ('}'):
 470         unary_delim = true;
 471         /* ?    code = ps.block_init ? rparen : rbrace; */
 472         code = rbrace;
 473         break;
 474
 475     case 014:                   /* a form feed */
 476         unary_delim = ps.last_u_d;
 477         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 478                                  * right */
 479         code = form_feed;
 480         break;
 481
 482     case (','):
 483         unary_delim = true;
 484         code = comma;
 485         break;
 486
 487     case '.':
 488         unary_delim = false;
 489         code = period;
 490         break;
 491
 492     case '-':
 493     case '+':                   /* check for -, +, --, ++ */
 494         code = (ps.last_u_d ? unary_op : binary_op);
 495         unary_delim = true;
 496
 497         if (*buf_ptr == token[0]) {
 498             /* check for doubled character */
 499             *e_token++ = *buf_ptr++;
 500             /* buffer overflow will be checked at end of loop */
 501             if (last_code == ident || last_code == rparen) {
 502                 code = (ps.last_u_d ? unary_op : postop);
 503                 /* check for following ++ or -- */
 504                 unary_delim = false;
 505             }
 506         }
 507         else if (*buf_ptr == '=')
 508             /* check for operator += */
 509             *e_token++ = *buf_ptr++;
 510         else if (*buf_ptr == '>') {
 511             /* check for operator -> */
 512             *e_token++ = *buf_ptr++;
 513             if (!pointer_as_binop) {
 514                 unary_delim = false;
 515                 code = unary_op;
 516                 ps.want_blank = false;
 517             }
 518         }
 519         break;                  /* buffer overflow will be checked at end of
 520                                  * switch */
 521
 522     case '=':
 523         if (ps.in_or_st)
 524             ps.block_init = 1;
 525 #ifdef undef
 526         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 527             e_token[-1] = *buf_ptr++;
 528             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 529                 *e_token++ = *buf_ptr++;
 530             *e_token++ = '=';   /* Flip =+ to += */
 531             *e_token = 0;
 532         }
 533 #else
 534         if (*buf_ptr == '=') {/* == */
 535             *e_token++ = '=';   /* Flip =+ to += */
 536             buf_ptr++;
 537             *e_token = 0;
 538         }
 539 #endif
 540         code = binary_op;
 541         unary_delim = true;
 542         break;
 543         /* can drop thru!!! */
 544
 545     case '>':
 546     case '<':
 547     case '!':                   /* ops like <, <<, <=, !=, etc */
 548         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 549             *e_token++ = *buf_ptr;
 550             if (++buf_ptr >= buf_end)
 551                 fill_buffer();
 552         }
 553         if (*buf_ptr == '=')
 554             *e_token++ = *buf_ptr++;
 555         code = (ps.last_u_d ? unary_op : binary_op);
 556         unary_delim = true;
 557         break;
 558
 559     default:
 560         if (token[0] == '/' && *buf_ptr == '*') {
 561             /* it is start of comment */
 562             *e_token++ = '*';
 563
 564             if (++buf_ptr >= buf_end)
 565                 fill_buffer();
 566
 567             code = comment;
 568             unary_delim = ps.last_u_d;
 569             break;
 570         }
 571         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 572             /*
 573              * handle ||, &&, etc, and also things as in int *****i
 574              */
 575             *e_token++ = *buf_ptr;
 576             if (++buf_ptr >= buf_end)
 577                 fill_buffer();
 578         }
 579         code = (ps.last_u_d ? unary_op : binary_op);
 580         unary_delim = true;
 581
 582
 583     }                           /* end of switch */
 584     if (code != newline) {
 585         l_struct = false;
 586         last_code = code;
 587     }
 588     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 589         fill_buffer();
 590     ps.last_u_d = unary_delim;
 591     *e_token = '\0';            /* null terminate the token */
 592     return (code);
 593 }
 594
 595 void
 596 alloc_typenames(void)
 597 {
 598
 599     typenames = (const char **)malloc(sizeof(typenames[0]) *
 600         (typename_count = 16));
 601     if (typenames == NULL)
 602         err(1, NULL);
 603 }
 604
 605 void
 606 add_typename(const char *key)
 607 {
 608     int comparison;
 609     const char *copy;
 610
 611     if (typename_top + 1 >= typename_count) {
 612         typenames = realloc((void *)typenames,
 613             sizeof(typenames[0]) * (typename_count *= 2));
 614         if (typenames == NULL)
 615             err(1, NULL);
 616     }
 617     if (typename_top == -1)
 618         typenames[++typename_top] = copy = strdup(key);
 619     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
 620         /* take advantage of sorted input */
 621         if (comparison == 0)    /* remove duplicates */
 622             return;
 623         typenames[++typename_top] = copy = strdup(key);
 624     }
 625     else {
 626         int p;
 627
 628         for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
 629             /* find place for the new key */;
 630         if (comparison == 0)    /* remove duplicates */
 631             return;
 632         memmove(&typenames[p + 1], &typenames[p],
 633             sizeof(typenames[0]) * (++typename_top - p));
 634         typenames[p] = copy = strdup(key);
 635     }
 636
 637     if (copy == NULL)
 638         err(1, NULL);
 639 }