usr.bin/indent/lexi.c

   1 /*-
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
  59 #define alphanum 1
  60 #define opchar 3
  61
  62 struct templ {
  63     const char *rwd;
  64     int         rwcode;
  65 };
  66
  67 /*
  68  * This table has to be sorted alphabetically, because it'll be used in binary
  69  * search. For the same reason, string must be the first thing in struct templ.
  70  */
  71 struct templ specials[] =
  72 {
  73     {"break", 9},
  74     {"case", 8},
  75     {"char", 4},
  76     {"const", 4},
  77     {"default", 8},
  78     {"do", 6},
  79     {"double", 4},
  80     {"else", 6},
  81     {"enum", 3},
  82     {"extern", 4},
  83     {"float", 4},
  84     {"for", 5},
  85     {"global", 4},
  86     {"goto", 9},
  87     {"if", 5},
  88     {"int", 4},
  89     {"long", 4},
  90     {"offsetof", 1},
  91     {"register", 4},
  92     {"return", 9},
  93     {"short", 4},
  94     {"sizeof", 2},
  95     {"static", 4},
  96     {"struct", 3},
  97     {"switch", 7},
  98     {"typedef", 4},
  99     {"union", 3},
 100     {"unsigned", 4},
 101     {"void", 4},
 102     {"volatile", 4},
 103     {"while", 5}
 104 };
 105
 106 const char **typenames;
 107 int         typename_count;
 108 int         typename_top = -1;
 109
 110 char        chartype[128] =
 111 {                               /* this is used to facilitate the decision of
 112                                  * what type (alphanumeric, operator) each
 113                                  * character is */
 114     0, 0, 0, 0, 0, 0, 0, 0,
 115     0, 0, 0, 0, 0, 0, 0, 0,
 116     0, 0, 0, 0, 0, 0, 0, 0,
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118     0, 3, 0, 0, 1, 3, 3, 0,
 119     0, 0, 3, 3, 0, 3, 0, 3,
 120     1, 1, 1, 1, 1, 1, 1, 1,
 121     1, 1, 0, 0, 3, 3, 3, 3,
 122     0, 1, 1, 1, 1, 1, 1, 1,
 123     1, 1, 1, 1, 1, 1, 1, 1,
 124     1, 1, 1, 1, 1, 1, 1, 1,
 125     1, 1, 1, 0, 0, 0, 3, 1,
 126     0, 1, 1, 1, 1, 1, 1, 1,
 127     1, 1, 1, 1, 1, 1, 1, 1,
 128     1, 1, 1, 1, 1, 1, 1, 1,
 129     1, 1, 1, 0, 3, 0, 3, 0
 130 };
 131
 132 static int
 133 strcmp_type(const void *e1, const void *e2)
 134 {
 135     return (strcmp(e1, *(const char * const *)e2));
 136 }
 137
 138 int
 139 lexi(void)
 140 {
 141     int         unary_delim;    /* this is set to 1 if the current token
 142                                  * forces a following operator to be unary */
 143     static int  last_code;      /* the last token type returned */
 144     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 145     int         code;           /* internal code to be returned */
 146     char        qchar;          /* the delimiter character for a string */
 147
 148     e_token = s_token;          /* point to start of place to save token */
 149     unary_delim = false;
 150     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 151                                  * column 1 iff the last thing scanned was nl */
 152     ps.last_nl = false;
 153
 154     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 155         ps.col_1 = false;       /* leading blanks imply token is not in column
 156                                  * 1 */
 157         if (++buf_ptr >= buf_end)
 158             fill_buffer();
 159     }
 160
 161     /* Scan an alphanumeric token */
 162     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 163         /*
 164          * we have a character or number
 165          */
 166         struct templ *p;
 167
 168         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 169             int         seendot = 0,
 170                         seenexp = 0,
 171                         seensfx = 0;
 172             if (*buf_ptr == '0' &&
 173                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 174                 *e_token++ = *buf_ptr++;
 175                 *e_token++ = *buf_ptr++;
 176                 while (isxdigit(*buf_ptr)) {
 177                     CHECK_SIZE_TOKEN;
 178                     *e_token++ = *buf_ptr++;
 179                 }
 180             }
 181             else
 182                 while (1) {
 183                     if (*buf_ptr == '.') {
 184                         if (seendot)
 185                             break;
 186                         else
 187                             seendot++;
 188                     }
 189                     CHECK_SIZE_TOKEN;
 190                     *e_token++ = *buf_ptr++;
 191                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 192                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 193                             break;
 194                         else {
 195                             seenexp++;
 196                             seendot++;
 197                             CHECK_SIZE_TOKEN;
 198                             *e_token++ = *buf_ptr++;
 199                             if (*buf_ptr == '+' || *buf_ptr == '-')
 200                                 *e_token++ = *buf_ptr++;
 201                         }
 202                     }
 203                 }
 204             while (1) {
 205                 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 206                     CHECK_SIZE_TOKEN;
 207                     *e_token++ = *buf_ptr++;
 208                     seensfx |= 1;
 209                     continue;
 210                 }
 211                 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
 212                     CHECK_SIZE_TOKEN;
 213                     if (buf_ptr[1] == buf_ptr[0])
 214                         *e_token++ = *buf_ptr++;
 215                     *e_token++ = *buf_ptr++;
 216                     seensfx |= 2;
 217                     continue;
 218                 }
 219                 break;
 220             }
 221         }
 222         else
 223             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 224                 /* fill_buffer() terminates buffer with newline */
 225                 if (*buf_ptr == BACKSLASH) {
 226                     if (*(buf_ptr + 1) == '\n') {
 227                         buf_ptr += 2;
 228                         if (buf_ptr >= buf_end)
 229                             fill_buffer();
 230                         } else
 231                             break;
 232                 }
 233                 CHECK_SIZE_TOKEN;
 234                 /* copy it over */
 235                 *e_token++ = *buf_ptr++;
 236                 if (buf_ptr >= buf_end)
 237                     fill_buffer();
 238             }
 239         *e_token++ = '\0';
 240         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 241             if (++buf_ptr >= buf_end)
 242                 fill_buffer();
 243         }
 244         ps.keyword = 0;
 245         if (l_struct && !ps.p_l_follow) {
 246                                 /* if last token was 'struct' and we're not
 247                                  * in parentheses, then this token
 248                                  * should be treated as a declaration */
 249             l_struct = false;
 250             last_code = ident;
 251             ps.last_u_d = true;
 252             return (decl);
 253         }
 254         ps.last_u_d = l_struct; /* Operator after identifier is binary
 255                                  * unless last token was 'struct' */
 256         l_struct = false;
 257         last_code = ident;      /* Remember that this is the code we will
 258                                  * return */
 259
 260         p = bsearch(s_token,
 261             specials,
 262             sizeof(specials) / sizeof(specials[0]),
 263             sizeof(specials[0]),
 264             strcmp_type);
 265         if (p == NULL) {        /* not a special keyword... */
 266             char *u;
 267
 268             /* ... so maybe a type_t or a typedef */
 269             if ((auto_typedefs && ((u = strrchr(s_token, '_')) != NULL) &&
 270                 strcmp(u, "_t") == 0) || (typename_top >= 0 &&
 271                   bsearch(s_token, typenames, typename_top + 1,
 272                     sizeof(typenames[0]), strcmp_type))) {
 273                 ps.keyword = 4; /* a type name */
 274                 ps.last_u_d = true;
 275                 goto found_typename;
 276             }
 277         } else {                        /* we have a keyword */
 278             ps.keyword = p->rwcode;
 279             ps.last_u_d = true;
 280             switch (p->rwcode) {
 281             case 7:             /* it is a switch */
 282                 return (swstmt);
 283             case 8:             /* a case or default */
 284                 return (casestmt);
 285
 286             case 3:             /* a "struct" */
 287                 /*
 288                  * Next time around, we will want to know that we have had a
 289                  * 'struct'
 290                  */
 291                 l_struct = true;
 292                 /* FALLTHROUGH */
 293
 294             case 4:             /* one of the declaration keywords */
 295             found_typename:
 296                 if (ps.p_l_follow) {
 297                     /* inside parens: cast, param list, offsetof or sizeof */
 298                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
 299                     break;
 300                 }
 301                 last_code = decl;
 302                 return (decl);
 303
 304             case 5:             /* if, while, for */
 305                 return (sp_paren);
 306
 307             case 6:             /* do, else */
 308                 return (sp_nparen);
 309
 310             default:            /* all others are treated like any other
 311                                  * identifier */
 312                 return (ident);
 313             }                   /* end of switch */
 314         }                       /* end of if (found_it) */
 315         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 316             char *tp = buf_ptr;
 317             while (tp < buf_end)
 318                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 319                     goto not_proc;
 320             strncpy(ps.procname, token, sizeof ps.procname - 1);
 321             ps.in_parameter_declaration = 1;
 322             rparen_count = 1;
 323     not_proc:;
 324         }
 325         /*
 326          * The following hack attempts to guess whether or not the current
 327          * token is in fact a declaration keyword -- one that has been
 328          * typedefd
 329          */
 330         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 331                 && !ps.p_l_follow
 332                 && !ps.block_init
 333                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 334                     ps.last_token == decl ||
 335                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 336             ps.keyword = 4;     /* a type name */
 337             ps.last_u_d = true;
 338             last_code = decl;
 339             return decl;
 340         }
 341         if (last_code == decl)  /* if this is a declared variable, then
 342                                  * following sign is unary */
 343             ps.last_u_d = true; /* will make "int a -1" work */
 344         last_code = ident;
 345         return (ident);         /* the ident is not in the list */
 346     }                           /* end of procesing for alpanum character */
 347
 348     /* Scan a non-alphanumeric token */
 349
 350     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 351                                  * moved here */
 352     *e_token = '\0';
 353     if (++buf_ptr >= buf_end)
 354         fill_buffer();
 355
 356     switch (*token) {
 357     case '\n':
 358         unary_delim = ps.last_u_d;
 359         ps.last_nl = true;      /* remember that we just had a newline */
 360         code = (had_eof ? 0 : newline);
 361
 362         /*
 363          * if data has been exhausted, the newline is a dummy, and we should
 364          * return code to stop
 365          */
 366         break;
 367
 368     case '\'':                  /* start of quoted character */
 369     case '"':                   /* start of string */
 370         qchar = *token;
 371         if (troff) {
 372             e_token[-1] = '`';
 373             if (qchar == '"')
 374                 *e_token++ = '`';
 375             e_token = chfont(&bodyf, &stringf, e_token);
 376         }
 377         do {                    /* copy the string */
 378             while (1) {         /* move one character or [/<char>]<char> */
 379                 if (*buf_ptr == '\n') {
 380                     diag2(1, "Unterminated literal");
 381                     goto stop_lit;
 382                 }
 383                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 384                                          * since CHECK_SIZE guarantees that there
 385                                          * are at least 5 entries left */
 386                 *e_token = *buf_ptr++;
 387                 if (buf_ptr >= buf_end)
 388                     fill_buffer();
 389                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 390                     if (*buf_ptr == '\n')       /* check for escaped newline */
 391                         ++line_no;
 392                     if (troff) {
 393                         *++e_token = BACKSLASH;
 394                         if (*buf_ptr == BACKSLASH)
 395                             *++e_token = BACKSLASH;
 396                     }
 397                     *++e_token = *buf_ptr++;
 398                     ++e_token;  /* we must increment this again because we
 399                                  * copied two chars */
 400                     if (buf_ptr >= buf_end)
 401                         fill_buffer();
 402                 }
 403                 else
 404                     break;      /* we copied one character */
 405             }                   /* end of while (1) */
 406         } while (*e_token++ != qchar);
 407         if (troff) {
 408             e_token = chfont(&stringf, &bodyf, e_token - 1);
 409             if (qchar == '"')
 410                 *e_token++ = '\'';
 411         }
 412 stop_lit:
 413         code = ident;
 414         break;
 415
 416     case ('('):
 417     case ('['):
 418         unary_delim = true;
 419         code = lparen;
 420         break;
 421
 422     case (')'):
 423     case (']'):
 424         code = rparen;
 425         break;
 426
 427     case '#':
 428         unary_delim = ps.last_u_d;
 429         code = preesc;
 430         break;
 431
 432     case '?':
 433         unary_delim = true;
 434         code = question;
 435         break;
 436
 437     case (':'):
 438         code = colon;
 439         unary_delim = true;
 440         break;
 441
 442     case (';'):
 443         unary_delim = true;
 444         code = semicolon;
 445         break;
 446
 447     case ('{'):
 448         unary_delim = true;
 449
 450         /*
 451          * if (ps.in_or_st) ps.block_init = 1;
 452          */
 453         /* ?    code = ps.block_init ? lparen : lbrace; */
 454         code = lbrace;
 455         break;
 456
 457     case ('}'):
 458         unary_delim = true;
 459         /* ?    code = ps.block_init ? rparen : rbrace; */
 460         code = rbrace;
 461         break;
 462
 463     case 014:                   /* a form feed */
 464         unary_delim = ps.last_u_d;
 465         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 466                                  * right */
 467         code = form_feed;
 468         break;
 469
 470     case (','):
 471         unary_delim = true;
 472         code = comma;
 473         break;
 474
 475     case '.':
 476         unary_delim = false;
 477         code = period;
 478         break;
 479
 480     case '-':
 481     case '+':                   /* check for -, +, --, ++ */
 482         code = (ps.last_u_d ? unary_op : binary_op);
 483         unary_delim = true;
 484
 485         if (*buf_ptr == token[0]) {
 486             /* check for doubled character */
 487             *e_token++ = *buf_ptr++;
 488             /* buffer overflow will be checked at end of loop */
 489             if (last_code == ident || last_code == rparen) {
 490                 code = (ps.last_u_d ? unary_op : postop);
 491                 /* check for following ++ or -- */
 492                 unary_delim = false;
 493             }
 494         }
 495         else if (*buf_ptr == '=')
 496             /* check for operator += */
 497             *e_token++ = *buf_ptr++;
 498         else if (*buf_ptr == '>') {
 499             /* check for operator -> */
 500             *e_token++ = *buf_ptr++;
 501             if (!pointer_as_binop) {
 502                 unary_delim = false;
 503                 code = unary_op;
 504                 ps.want_blank = false;
 505             }
 506         }
 507         break;                  /* buffer overflow will be checked at end of
 508                                  * switch */
 509
 510     case '=':
 511         if (ps.in_or_st)
 512             ps.block_init = 1;
 513 #ifdef undef
 514         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 515             e_token[-1] = *buf_ptr++;
 516             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 517                 *e_token++ = *buf_ptr++;
 518             *e_token++ = '=';   /* Flip =+ to += */
 519             *e_token = 0;
 520         }
 521 #else
 522         if (*buf_ptr == '=') {/* == */
 523             *e_token++ = '=';   /* Flip =+ to += */
 524             buf_ptr++;
 525             *e_token = 0;
 526         }
 527 #endif
 528         code = binary_op;
 529         unary_delim = true;
 530         break;
 531         /* can drop thru!!! */
 532
 533     case '>':
 534     case '<':
 535     case '!':                   /* ops like <, <<, <=, !=, etc */
 536         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 537             *e_token++ = *buf_ptr;
 538             if (++buf_ptr >= buf_end)
 539                 fill_buffer();
 540         }
 541         if (*buf_ptr == '=')
 542             *e_token++ = *buf_ptr++;
 543         code = (ps.last_u_d ? unary_op : binary_op);
 544         unary_delim = true;
 545         break;
 546
 547     default:
 548         if (token[0] == '/' && *buf_ptr == '*') {
 549             /* it is start of comment */
 550             *e_token++ = '*';
 551
 552             if (++buf_ptr >= buf_end)
 553                 fill_buffer();
 554
 555             code = comment;
 556             unary_delim = ps.last_u_d;
 557             break;
 558         }
 559         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 560             /*
 561              * handle ||, &&, etc, and also things as in int *****i
 562              */
 563             *e_token++ = *buf_ptr;
 564             if (++buf_ptr >= buf_end)
 565                 fill_buffer();
 566         }
 567         code = (ps.last_u_d ? unary_op : binary_op);
 568         unary_delim = true;
 569
 570
 571     }                           /* end of switch */
 572     if (code != newline) {
 573         l_struct = false;
 574         last_code = code;
 575     }
 576     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 577         fill_buffer();
 578     ps.last_u_d = unary_delim;
 579     *e_token = '\0';            /* null terminate the token */
 580     return (code);
 581 }
 582
 583 void
 584 alloc_typenames(void)
 585 {
 586
 587     typenames = (const char **)malloc(sizeof(typenames[0]) *
 588         (typename_count = 16));
 589     if (typenames == NULL)
 590         err(1, NULL);
 591 }
 592
 593 void
 594 add_typename(const char *key)
 595 {
 596     int comparison;
 597     const char *copy;
 598
 599     if (typename_top + 1 >= typename_count) {
 600         typenames = realloc((void *)typenames,
 601             sizeof(typenames[0]) * (typename_count *= 2));
 602         if (typenames == NULL)
 603             err(1, NULL);
 604     }
 605     if (typename_top == -1)
 606         typenames[++typename_top] = copy = strdup(key);
 607     else if ((comparison = strcmp(key, typenames[typename_top])) >= 0) {
 608         /* take advantage of sorted input */
 609         if (comparison == 0)    /* remove duplicates */
 610             return;
 611         typenames[++typename_top] = copy = strdup(key);
 612     }
 613     else {
 614         int p;
 615
 616         for (p = 0; (comparison = strcmp(key, typenames[p])) > 0; p++)
 617             /* find place for the new key */;
 618         if (comparison == 0)    /* remove duplicates */
 619             return;
 620         memmove(&typenames[p + 1], &typenames[p],
 621             sizeof(typenames[0]) * (++typename_top - p));
 622         typenames[p] = copy = strdup(key);
 623     }
 624
 625     if (copy == NULL)
 626         err(1, NULL);
 627 }