usr.bin/indent/lexi.c

   1 /*-
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
  59 #define alphanum 1
  60 #define opchar 3
  61
  62 struct templ {
  63     const char *rwd;
  64     int         rwcode;
  65 };
  66
  67 struct templ specials[16384] =
  68 {
  69     {"switch", 7},
  70     {"case", 8},
  71     {"break", 9},
  72     {"struct", 3},
  73     {"union", 3},
  74     {"enum", 3},
  75     {"default", 8},
  76     {"int", 4},
  77     {"char", 4},
  78     {"float", 4},
  79     {"double", 4},
  80     {"long", 4},
  81     {"short", 4},
  82     {"typedef", 4},
  83     {"unsigned", 4},
  84     {"register", 4},
  85     {"static", 4},
  86     {"global", 4},
  87     {"extern", 4},
  88     {"void", 4},
  89     {"const", 4},
  90     {"volatile", 4},
  91     {"goto", 9},
  92     {"return", 9},
  93     {"if", 5},
  94     {"while", 5},
  95     {"for", 5},
  96     {"else", 6},
  97     {"do", 6},
  98     {"sizeof", 2},
  99     {"offsetof", 1},
 100     {0, 0}
 101 };
 102
 103 char        chartype[128] =
 104 {                               /* this is used to facilitate the decision of
 105                                  * what type (alphanumeric, operator) each
 106                                  * character is */
 107     0, 0, 0, 0, 0, 0, 0, 0,
 108     0, 0, 0, 0, 0, 0, 0, 0,
 109     0, 0, 0, 0, 0, 0, 0, 0,
 110     0, 0, 0, 0, 0, 0, 0, 0,
 111     0, 3, 0, 0, 1, 3, 3, 0,
 112     0, 0, 3, 3, 0, 3, 0, 3,
 113     1, 1, 1, 1, 1, 1, 1, 1,
 114     1, 1, 0, 0, 3, 3, 3, 3,
 115     0, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 1, 1, 1, 1, 1,
 117     1, 1, 1, 1, 1, 1, 1, 1,
 118     1, 1, 1, 0, 0, 0, 3, 1,
 119     0, 1, 1, 1, 1, 1, 1, 1,
 120     1, 1, 1, 1, 1, 1, 1, 1,
 121     1, 1, 1, 1, 1, 1, 1, 1,
 122     1, 1, 1, 0, 3, 0, 3, 0
 123 };
 124
 125 int
 126 lexi(void)
 127 {
 128     int         unary_delim;    /* this is set to 1 if the current token
 129                                  * forces a following operator to be unary */
 130     static int  last_code;      /* the last token type returned */
 131     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 132     int         code;           /* internal code to be returned */
 133     char        qchar;          /* the delimiter character for a string */
 134
 135     e_token = s_token;          /* point to start of place to save token */
 136     unary_delim = false;
 137     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 138                                  * column 1 iff the last thing scanned was nl */
 139     ps.last_nl = false;
 140
 141     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 142         ps.col_1 = false;       /* leading blanks imply token is not in column
 143                                  * 1 */
 144         if (++buf_ptr >= buf_end)
 145             fill_buffer();
 146     }
 147
 148     /* Scan an alphanumeric token */
 149     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 150         /*
 151          * we have a character or number
 152          */
 153         const char *j;          /* used for searching thru list of
 154                                  *
 155                                  * reserved words */
 156         struct templ *p;
 157
 158         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 159             int         seendot = 0,
 160                         seenexp = 0,
 161                         seensfx = 0;
 162             if (*buf_ptr == '0' &&
 163                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 164                 *e_token++ = *buf_ptr++;
 165                 *e_token++ = *buf_ptr++;
 166                 while (isxdigit(*buf_ptr)) {
 167                     CHECK_SIZE_TOKEN;
 168                     *e_token++ = *buf_ptr++;
 169                 }
 170             }
 171             else
 172                 while (1) {
 173                     if (*buf_ptr == '.') {
 174                         if (seendot)
 175                             break;
 176                         else
 177                             seendot++;
 178                     }
 179                     CHECK_SIZE_TOKEN;
 180                     *e_token++ = *buf_ptr++;
 181                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 182                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 183                             break;
 184                         else {
 185                             seenexp++;
 186                             seendot++;
 187                             CHECK_SIZE_TOKEN;
 188                             *e_token++ = *buf_ptr++;
 189                             if (*buf_ptr == '+' || *buf_ptr == '-')
 190                                 *e_token++ = *buf_ptr++;
 191                         }
 192                     }
 193                 }
 194             while (1) {
 195                 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 196                     CHECK_SIZE_TOKEN;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 1;
 199                     continue;
 200                 }
 201                 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
 202                     CHECK_SIZE_TOKEN;
 203                     if (buf_ptr[1] == buf_ptr[0])
 204                         *e_token++ = *buf_ptr++;
 205                     *e_token++ = *buf_ptr++;
 206                     seensfx |= 2;
 207                     continue;
 208                 }
 209                 break;
 210             }
 211         }
 212         else
 213             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 214                 /* fill_buffer() terminates buffer with newline */
 215                 if (*buf_ptr == BACKSLASH) {
 216                     if (*(buf_ptr + 1) == '\n') {
 217                         buf_ptr += 2;
 218                         if (buf_ptr >= buf_end)
 219                             fill_buffer();
 220                         } else
 221                             break;
 222                 }
 223                 CHECK_SIZE_TOKEN;
 224                 /* copy it over */
 225                 *e_token++ = *buf_ptr++;
 226                 if (buf_ptr >= buf_end)
 227                     fill_buffer();
 228             }
 229         *e_token++ = '\0';
 230         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 231             if (++buf_ptr >= buf_end)
 232                 fill_buffer();
 233         }
 234         ps.keyword = 0;
 235         if (l_struct && !ps.p_l_follow) {
 236                                 /* if last token was 'struct' and we're not
 237                                  * in parentheses, then this token
 238                                  * should be treated as a declaration */
 239             l_struct = false;
 240             last_code = ident;
 241             ps.last_u_d = true;
 242             return (decl);
 243         }
 244         ps.last_u_d = l_struct; /* Operator after identifier is binary
 245                                  * unless last token was 'struct' */
 246         l_struct = false;
 247         last_code = ident;      /* Remember that this is the code we will
 248                                  * return */
 249
 250         if (auto_typedefs) {
 251             const char *q = s_token;
 252             size_t q_len = strlen(q);
 253             /* Check if we have an "_t" in the end */
 254             if (q_len > 2 &&
 255                 (strcmp(q + q_len - 2, "_t") == 0)) {
 256                 ps.keyword = 4; /* a type name */
 257                 ps.last_u_d = true;
 258                 goto found_auto_typedef;
 259             }
 260         }
 261
 262         /*
 263          * This loop will check if the token is a keyword.
 264          */
 265         for (p = specials; (j = p->rwd) != NULL; p++) {
 266             const char *q = s_token;    /* point at scanned token */
 267             if (*j++ != *q++ || *j++ != *q++)
 268                 continue;       /* This test depends on the fact that
 269                                  * identifiers are always at least 1 character
 270                                  * long (ie. the first two bytes of the
 271                                  * identifier are always meaningful) */
 272             if (q[-1] == 0)
 273                 break;          /* If its a one-character identifier */
 274             while (*q++ == *j)
 275                 if (*j++ == 0)
 276                     goto found_keyword; /* I wish that C had a multi-level
 277                                          * break... */
 278         }
 279         if (p->rwd) {           /* we have a keyword */
 280     found_keyword:
 281             ps.keyword = p->rwcode;
 282             ps.last_u_d = true;
 283             switch (p->rwcode) {
 284             case 7:             /* it is a switch */
 285                 return (swstmt);
 286             case 8:             /* a case or default */
 287                 return (casestmt);
 288
 289             case 3:             /* a "struct" */
 290                 /*
 291                  * Next time around, we will want to know that we have had a
 292                  * 'struct'
 293                  */
 294                 l_struct = true;
 295                 /* FALLTHROUGH */
 296
 297             case 4:             /* one of the declaration keywords */
 298             found_auto_typedef:
 299                 if (ps.p_l_follow) {
 300                     /* inside parens: cast, param list, offsetof or sizeof */
 301                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
 302                     break;
 303                 }
 304                 last_code = decl;
 305                 return (decl);
 306
 307             case 5:             /* if, while, for */
 308                 return (sp_paren);
 309
 310             case 6:             /* do, else */
 311                 return (sp_nparen);
 312
 313             default:            /* all others are treated like any other
 314                                  * identifier */
 315                 return (ident);
 316             }                   /* end of switch */
 317         }                       /* end of if (found_it) */
 318         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 319             char *tp = buf_ptr;
 320             while (tp < buf_end)
 321                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 322                     goto not_proc;
 323             strncpy(ps.procname, token, sizeof ps.procname - 1);
 324             ps.in_parameter_declaration = 1;
 325             rparen_count = 1;
 326     not_proc:;
 327         }
 328         /*
 329          * The following hack attempts to guess whether or not the current
 330          * token is in fact a declaration keyword -- one that has been
 331          * typedefd
 332          */
 333         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 334                 && !ps.p_l_follow
 335                 && !ps.block_init
 336                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 337                     ps.last_token == decl ||
 338                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 339             ps.keyword = 4;     /* a type name */
 340             ps.last_u_d = true;
 341             last_code = decl;
 342             return decl;
 343         }
 344         if (last_code == decl)  /* if this is a declared variable, then
 345                                  * following sign is unary */
 346             ps.last_u_d = true; /* will make "int a -1" work */
 347         last_code = ident;
 348         return (ident);         /* the ident is not in the list */
 349     }                           /* end of procesing for alpanum character */
 350
 351     /* Scan a non-alphanumeric token */
 352
 353     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 354                                  * moved here */
 355     *e_token = '\0';
 356     if (++buf_ptr >= buf_end)
 357         fill_buffer();
 358
 359     switch (*token) {
 360     case '\n':
 361         unary_delim = ps.last_u_d;
 362         ps.last_nl = true;      /* remember that we just had a newline */
 363         code = (had_eof ? 0 : newline);
 364
 365         /*
 366          * if data has been exhausted, the newline is a dummy, and we should
 367          * return code to stop
 368          */
 369         break;
 370
 371     case '\'':                  /* start of quoted character */
 372     case '"':                   /* start of string */
 373         qchar = *token;
 374         if (troff) {
 375             e_token[-1] = '`';
 376             if (qchar == '"')
 377                 *e_token++ = '`';
 378             e_token = chfont(&bodyf, &stringf, e_token);
 379         }
 380         do {                    /* copy the string */
 381             while (1) {         /* move one character or [/<char>]<char> */
 382                 if (*buf_ptr == '\n') {
 383                     diag2(1, "Unterminated literal");
 384                     goto stop_lit;
 385                 }
 386                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 387                                          * since CHECK_SIZE guarantees that there
 388                                          * are at least 5 entries left */
 389                 *e_token = *buf_ptr++;
 390                 if (buf_ptr >= buf_end)
 391                     fill_buffer();
 392                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 393                     if (*buf_ptr == '\n')       /* check for escaped newline */
 394                         ++line_no;
 395                     if (troff) {
 396                         *++e_token = BACKSLASH;
 397                         if (*buf_ptr == BACKSLASH)
 398                             *++e_token = BACKSLASH;
 399                     }
 400                     *++e_token = *buf_ptr++;
 401                     ++e_token;  /* we must increment this again because we
 402                                  * copied two chars */
 403                     if (buf_ptr >= buf_end)
 404                         fill_buffer();
 405                 }
 406                 else
 407                     break;      /* we copied one character */
 408             }                   /* end of while (1) */
 409         } while (*e_token++ != qchar);
 410         if (troff) {
 411             e_token = chfont(&stringf, &bodyf, e_token - 1);
 412             if (qchar == '"')
 413                 *e_token++ = '\'';
 414         }
 415 stop_lit:
 416         code = ident;
 417         break;
 418
 419     case ('('):
 420     case ('['):
 421         unary_delim = true;
 422         code = lparen;
 423         break;
 424
 425     case (')'):
 426     case (']'):
 427         code = rparen;
 428         break;
 429
 430     case '#':
 431         unary_delim = ps.last_u_d;
 432         code = preesc;
 433         break;
 434
 435     case '?':
 436         unary_delim = true;
 437         code = question;
 438         break;
 439
 440     case (':'):
 441         code = colon;
 442         unary_delim = true;
 443         break;
 444
 445     case (';'):
 446         unary_delim = true;
 447         code = semicolon;
 448         break;
 449
 450     case ('{'):
 451         unary_delim = true;
 452
 453         /*
 454          * if (ps.in_or_st) ps.block_init = 1;
 455          */
 456         /* ?    code = ps.block_init ? lparen : lbrace; */
 457         code = lbrace;
 458         break;
 459
 460     case ('}'):
 461         unary_delim = true;
 462         /* ?    code = ps.block_init ? rparen : rbrace; */
 463         code = rbrace;
 464         break;
 465
 466     case 014:                   /* a form feed */
 467         unary_delim = ps.last_u_d;
 468         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 469                                  * right */
 470         code = form_feed;
 471         break;
 472
 473     case (','):
 474         unary_delim = true;
 475         code = comma;
 476         break;
 477
 478     case '.':
 479         unary_delim = false;
 480         code = period;
 481         break;
 482
 483     case '-':
 484     case '+':                   /* check for -, +, --, ++ */
 485         code = (ps.last_u_d ? unary_op : binary_op);
 486         unary_delim = true;
 487
 488         if (*buf_ptr == token[0]) {
 489             /* check for doubled character */
 490             *e_token++ = *buf_ptr++;
 491             /* buffer overflow will be checked at end of loop */
 492             if (last_code == ident || last_code == rparen) {
 493                 code = (ps.last_u_d ? unary_op : postop);
 494                 /* check for following ++ or -- */
 495                 unary_delim = false;
 496             }
 497         }
 498         else if (*buf_ptr == '=')
 499             /* check for operator += */
 500             *e_token++ = *buf_ptr++;
 501         else if (*buf_ptr == '>') {
 502             /* check for operator -> */
 503             *e_token++ = *buf_ptr++;
 504             if (!pointer_as_binop) {
 505                 unary_delim = false;
 506                 code = unary_op;
 507                 ps.want_blank = false;
 508             }
 509         }
 510         break;                  /* buffer overflow will be checked at end of
 511                                  * switch */
 512
 513     case '=':
 514         if (ps.in_or_st)
 515             ps.block_init = 1;
 516 #ifdef undef
 517         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 518             e_token[-1] = *buf_ptr++;
 519             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 520                 *e_token++ = *buf_ptr++;
 521             *e_token++ = '=';   /* Flip =+ to += */
 522             *e_token = 0;
 523         }
 524 #else
 525         if (*buf_ptr == '=') {/* == */
 526             *e_token++ = '=';   /* Flip =+ to += */
 527             buf_ptr++;
 528             *e_token = 0;
 529         }
 530 #endif
 531         code = binary_op;
 532         unary_delim = true;
 533         break;
 534         /* can drop thru!!! */
 535
 536     case '>':
 537     case '<':
 538     case '!':                   /* ops like <, <<, <=, !=, etc */
 539         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 540             *e_token++ = *buf_ptr;
 541             if (++buf_ptr >= buf_end)
 542                 fill_buffer();
 543         }
 544         if (*buf_ptr == '=')
 545             *e_token++ = *buf_ptr++;
 546         code = (ps.last_u_d ? unary_op : binary_op);
 547         unary_delim = true;
 548         break;
 549
 550     default:
 551         if (token[0] == '/' && *buf_ptr == '*') {
 552             /* it is start of comment */
 553             *e_token++ = '*';
 554
 555             if (++buf_ptr >= buf_end)
 556                 fill_buffer();
 557
 558             code = comment;
 559             unary_delim = ps.last_u_d;
 560             break;
 561         }
 562         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 563             /*
 564              * handle ||, &&, etc, and also things as in int *****i
 565              */
 566             *e_token++ = *buf_ptr;
 567             if (++buf_ptr >= buf_end)
 568                 fill_buffer();
 569         }
 570         code = (ps.last_u_d ? unary_op : binary_op);
 571         unary_delim = true;
 572
 573
 574     }                           /* end of switch */
 575     if (code != newline) {
 576         l_struct = false;
 577         last_code = code;
 578     }
 579     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 580         fill_buffer();
 581     ps.last_u_d = unary_delim;
 582     *e_token = '\0';            /* null terminate the token */
 583     return (code);
 584 }
 585
 586 /*
 587  * Add the given keyword to the keyword table, using val as the keyword type
 588  */
 589 void
 590 addkey(char *key, int val)
 591 {
 592     struct templ *p = specials;
 593     while (p->rwd)
 594         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 595             return;
 596         else
 597             p++;
 598     if (p >= specials + sizeof(specials) / sizeof(specials[0])) {
 599         fprintf(stderr, "indent: typedef table overflow\n");
 600         exit(1);
 601     }
 602     p->rwd = key;
 603     p->rwcode = val;
 604     p[1].rwd = NULL;
 605     p[1].rwcode = 0;
 606 }