usr.bin/indent/lexi.c

   1 /*-
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
/* chartype[] class value that lexi() tests for when scanning identifiers and numbers */
#define alphanum 1
#ifdef undef
/* chartype[] class value for operator characters; only used by code guarded by `undef' */
#define opchar 3
#endif
  63
  64 struct templ {
  65     const char *rwd;
  66     int         rwcode;
  67 };
  68
  69 struct templ specials[1000] =
  70 {
  71     {"switch", 7},
  72     {"case", 8},
  73     {"break", 9},
  74     {"struct", 3},
  75     {"union", 3},
  76     {"enum", 3},
  77     {"default", 8},
  78     {"int", 4},
  79     {"char", 4},
  80     {"float", 4},
  81     {"double", 4},
  82     {"long", 4},
  83     {"short", 4},
  84     {"typedef", 4},
  85     {"unsigned", 4},
  86     {"register", 4},
  87     {"static", 4},
  88     {"global", 4},
  89     {"extern", 4},
  90     {"void", 4},
  91     {"const", 4},
  92     {"volatile", 4},
  93     {"goto", 9},
  94     {"return", 9},
  95     {"if", 5},
  96     {"while", 5},
  97     {"for", 5},
  98     {"else", 6},
  99     {"do", 6},
 100     {"sizeof", 2},
 101     {"offsetof", 1},
 102     {0, 0}
 103 };
 104
 105 char        chartype[128] =
 106 {                               /* this is used to facilitate the decision of
 107                                  * what type (alphanumeric, operator) each
 108                                  * character is */
 109     0, 0, 0, 0, 0, 0, 0, 0,
 110     0, 0, 0, 0, 0, 0, 0, 0,
 111     0, 0, 0, 0, 0, 0, 0, 0,
 112     0, 0, 0, 0, 0, 0, 0, 0,
 113     0, 3, 0, 0, 1, 3, 3, 0,
 114     0, 0, 3, 3, 0, 3, 0, 3,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 0, 0, 3, 3, 3, 3,
 117     0, 1, 1, 1, 1, 1, 1, 1,
 118     1, 1, 1, 1, 1, 1, 1, 1,
 119     1, 1, 1, 1, 1, 1, 1, 1,
 120     1, 1, 1, 0, 0, 0, 3, 1,
 121     0, 1, 1, 1, 1, 1, 1, 1,
 122     1, 1, 1, 1, 1, 1, 1, 1,
 123     1, 1, 1, 1, 1, 1, 1, 1,
 124     1, 1, 1, 0, 3, 0, 3, 0
 125 };
 126
 127 int
 128 lexi(void)
 129 {
 130     int         unary_delim;    /* this is set to 1 if the current token
 131                                  * forces a following operator to be unary */
 132     static int  last_code;      /* the last token type returned */
 133     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 134     int         code;           /* internal code to be returned */
 135     char        qchar;          /* the delimiter character for a string */
 136
 137     e_token = s_token;          /* point to start of place to save token */
 138     unary_delim = false;
 139     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 140                                  * column 1 iff the last thing scanned was nl */
 141     ps.last_nl = false;
 142
 143     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 144         ps.col_1 = false;       /* leading blanks imply token is not in column
 145                                  * 1 */
 146         if (++buf_ptr >= buf_end)
 147             fill_buffer();
 148     }
 149
 150     /* Scan an alphanumeric token */
 151     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 152         /*
 153          * we have a character or number
 154          */
 155         const char *j;          /* used for searching thru list of
 156                                  *
 157                                  * reserved words */
 158         struct templ *p;
 159
 160         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 161             enum base {
 162                 BASE_2, BASE_8, BASE_10, BASE_16
 163             };
 164             int         seendot = 0,
 165                         seenexp = 0,
 166                         seensfx = 0;
 167             enum base   in_base = BASE_10;
 168
 169             if (*buf_ptr == '0') {
 170                 if (buf_ptr[1] == 'b' || buf_ptr[1] == 'B')
 171                     in_base = BASE_2;
 172                 else if (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')
 173                     in_base = BASE_16;
 174                 else if (isdigit(buf_ptr[1]))
 175                     in_base = BASE_8;
 176             }
 177             switch (in_base) {
 178             case BASE_2:
 179                 *e_token++ = *buf_ptr++;
 180                 *e_token++ = *buf_ptr++;
 181                 while (*buf_ptr == '0' || *buf_ptr == '1') {
 182                     CHECK_SIZE_TOKEN;
 183                     *e_token++ = *buf_ptr++;
 184                 }
 185                 break;
 186             case BASE_8:
 187                 *e_token++ = *buf_ptr++;
 188                 while (*buf_ptr >= '0' && *buf_ptr <= '8') {
 189                     CHECK_SIZE_TOKEN;
 190                     *e_token++ = *buf_ptr++;
 191                 }
 192                 break;
 193             case BASE_16:
 194                 *e_token++ = *buf_ptr++;
 195                 *e_token++ = *buf_ptr++;
 196                 while (isxdigit(*buf_ptr)) {
 197                     CHECK_SIZE_TOKEN;
 198                     *e_token++ = *buf_ptr++;
 199                 }
 200                 break;
 201             case BASE_10:
 202                 while (1) {
 203                     if (*buf_ptr == '.') {
 204                         if (seendot)
 205                             break;
 206                         else
 207                             seendot++;
 208                     }
 209                     CHECK_SIZE_TOKEN;
 210                     *e_token++ = *buf_ptr++;
 211                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 212                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 213                             break;
 214                         else {
 215                             seenexp++;
 216                             seendot++;
 217                             CHECK_SIZE_TOKEN;
 218                             *e_token++ = *buf_ptr++;
 219                             if (*buf_ptr == '+' || *buf_ptr == '-')
 220                                 *e_token++ = *buf_ptr++;
 221                         }
 222                     }
 223                 }
 224                 break;
 225             }
 226             while (1) {
 227                 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 228                     CHECK_SIZE_TOKEN;
 229                     *e_token++ = *buf_ptr++;
 230                     seensfx |= 1;
 231                     continue;
 232                 }
 233                 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
 234                     CHECK_SIZE_TOKEN;
 235                     if (buf_ptr[1] == buf_ptr[0])
 236                         *e_token++ = *buf_ptr++;
 237                     *e_token++ = *buf_ptr++;
 238                     seensfx |= 2;
 239                     continue;
 240                 }
 241                 break;
 242             }
 243         }
 244         else
 245             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 246                 /* fill_buffer() terminates buffer with newline */
 247                 if (*buf_ptr == BACKSLASH) {
 248                     if (*(buf_ptr + 1) == '\n') {
 249                         buf_ptr += 2;
 250                         if (buf_ptr >= buf_end)
 251                             fill_buffer();
 252                         } else
 253                             break;
 254                 }
 255                 CHECK_SIZE_TOKEN;
 256                 /* copy it over */
 257                 *e_token++ = *buf_ptr++;
 258                 if (buf_ptr >= buf_end)
 259                     fill_buffer();
 260             }
 261         *e_token++ = '\0';
 262
 263         if (s_token[0] == 'L' && s_token[1] == '\0' &&
 264               (*buf_ptr == '"' || *buf_ptr == '\''))
 265             return (strpfx);
 266
 267         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 268             if (++buf_ptr >= buf_end)
 269                 fill_buffer();
 270         }
 271         ps.keyword = 0;
 272         if (l_struct && !ps.p_l_follow) {
 273                                 /* if last token was 'struct' and we're not
 274                                  * in parentheses, then this token
 275                                  * should be treated as a declaration */
 276             l_struct = false;
 277             last_code = ident;
 278             ps.last_u_d = true;
 279             return (decl);
 280         }
 281         ps.last_u_d = l_struct; /* Operator after identifier is binary
 282                                  * unless last token was 'struct' */
 283         l_struct = false;
 284         last_code = ident;      /* Remember that this is the code we will
 285                                  * return */
 286
 287         if (auto_typedefs) {
 288             const char *q = s_token;
 289             size_t q_len = strlen(q);
 290             /* Check if we have an "_t" in the end */
 291             if (q_len > 2 &&
 292                 (strcmp(q + q_len - 2, "_t") == 0)) {
 293                 ps.keyword = 4; /* a type name */
 294                 ps.last_u_d = true;
 295                 goto found_auto_typedef;
 296             }
 297         }
 298
 299         /*
 300          * This loop will check if the token is a keyword.
 301          */
 302         for (p = specials; (j = p->rwd) != NULL; p++) {
 303             const char *q = s_token;    /* point at scanned token */
 304             if (*j++ != *q++ || *j++ != *q++)
 305                 continue;       /* This test depends on the fact that
 306                                  * identifiers are always at least 1 character
 307                                  * long (ie. the first two bytes of the
 308                                  * identifier are always meaningful) */
 309             if (q[-1] == 0)
 310                 break;          /* If its a one-character identifier */
 311             while (*q++ == *j)
 312                 if (*j++ == 0)
 313                     goto found_keyword; /* I wish that C had a multi-level
 314                                          * break... */
 315         }
 316         if (p->rwd) {           /* we have a keyword */
 317     found_keyword:
 318             ps.keyword = p->rwcode;
 319             ps.last_u_d = true;
 320             switch (p->rwcode) {
 321             case 7:             /* it is a switch */
 322                 return (swstmt);
 323             case 8:             /* a case or default */
 324                 return (casestmt);
 325
 326             case 3:             /* a "struct" */
 327                 /*
 328                  * Next time around, we will want to know that we have had a
 329                  * 'struct'
 330                  */
 331                 l_struct = true;
 332                 /* FALLTHROUGH */
 333
 334             case 4:             /* one of the declaration keywords */
 335             found_auto_typedef:
 336                 if (ps.p_l_follow) {
 337                     /* inside parens: cast, param list, offsetof or sizeof */
 338                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
 339                     break;
 340                 }
 341                 last_code = decl;
 342                 return (decl);
 343
 344             case 5:             /* if, while, for */
 345                 return (sp_paren);
 346
 347             case 6:             /* do, else */
 348                 return (sp_nparen);
 349
 350             default:            /* all others are treated like any other
 351                                  * identifier */
 352                 return (ident);
 353             }                   /* end of switch */
 354         }                       /* end of if (found_it) */
 355         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 356             char *tp = buf_ptr;
 357             while (tp < buf_end)
 358                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 359                     goto not_proc;
 360             strncpy(ps.procname, token, sizeof ps.procname - 1);
 361             ps.in_parameter_declaration = 1;
 362             rparen_count = 1;
 363     not_proc:;
 364         }
 365         /*
 366          * The following hack attempts to guess whether or not the current
 367          * token is in fact a declaration keyword -- one that has been
 368          * typedefd
 369          */
 370         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 371                 && !ps.p_l_follow
 372                 && !ps.block_init
 373                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 374                     ps.last_token == decl ||
 375                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 376             ps.keyword = 4;     /* a type name */
 377             ps.last_u_d = true;
 378             last_code = decl;
 379             return decl;
 380         }
 381         if (last_code == decl)  /* if this is a declared variable, then
 382                                  * following sign is unary */
 383             ps.last_u_d = true; /* will make "int a -1" work */
 384         last_code = ident;
 385         return (ident);         /* the ident is not in the list */
 386     }                           /* end of procesing for alpanum character */
 387
 388     /* Scan a non-alphanumeric token */
 389
 390     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 391                                  * moved here */
 392     *e_token = '\0';
 393     if (++buf_ptr >= buf_end)
 394         fill_buffer();
 395
 396     switch (*token) {
 397     case '\n':
 398         unary_delim = ps.last_u_d;
 399         ps.last_nl = true;      /* remember that we just had a newline */
 400         code = (had_eof ? 0 : newline);
 401
 402         /*
 403          * if data has been exhausted, the newline is a dummy, and we should
 404          * return code to stop
 405          */
 406         break;
 407
 408     case '\'':                  /* start of quoted character */
 409     case '"':                   /* start of string */
 410         qchar = *token;
 411         if (troff) {
 412             e_token[-1] = '`';
 413             if (qchar == '"')
 414                 *e_token++ = '`';
 415             e_token = chfont(&bodyf, &stringf, e_token);
 416         }
 417         do {                    /* copy the string */
 418             while (1) {         /* move one character or [/<char>]<char> */
 419                 if (*buf_ptr == '\n') {
 420                     diag2(1, "Unterminated literal");
 421                     goto stop_lit;
 422                 }
 423                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 424                                          * since CHECK_SIZE guarantees that there
 425                                          * are at least 5 entries left */
 426                 *e_token = *buf_ptr++;
 427                 if (buf_ptr >= buf_end)
 428                     fill_buffer();
 429                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 430                     if (*buf_ptr == '\n')       /* check for escaped newline */
 431                         ++line_no;
 432                     if (troff) {
 433                         *++e_token = BACKSLASH;
 434                         if (*buf_ptr == BACKSLASH)
 435                             *++e_token = BACKSLASH;
 436                     }
 437                     *++e_token = *buf_ptr++;
 438                     ++e_token;  /* we must increment this again because we
 439                                  * copied two chars */
 440                     if (buf_ptr >= buf_end)
 441                         fill_buffer();
 442                 }
 443                 else
 444                     break;      /* we copied one character */
 445             }                   /* end of while (1) */
 446         } while (*e_token++ != qchar);
 447         if (troff) {
 448             e_token = chfont(&stringf, &bodyf, e_token - 1);
 449             if (qchar == '"')
 450                 *e_token++ = '\'';
 451         }
 452 stop_lit:
 453         code = ident;
 454         break;
 455
 456     case ('('):
 457     case ('['):
 458         unary_delim = true;
 459         code = lparen;
 460         break;
 461
 462     case (')'):
 463     case (']'):
 464         code = rparen;
 465         break;
 466
 467     case '#':
 468         unary_delim = ps.last_u_d;
 469         code = preesc;
 470         break;
 471
 472     case '?':
 473         unary_delim = true;
 474         code = question;
 475         break;
 476
 477     case (':'):
 478         code = colon;
 479         unary_delim = true;
 480         break;
 481
 482     case (';'):
 483         unary_delim = true;
 484         code = semicolon;
 485         break;
 486
 487     case ('{'):
 488         unary_delim = true;
 489
 490         /*
 491          * if (ps.in_or_st) ps.block_init = 1;
 492          */
 493         /* ?    code = ps.block_init ? lparen : lbrace; */
 494         code = lbrace;
 495         break;
 496
 497     case ('}'):
 498         unary_delim = true;
 499         /* ?    code = ps.block_init ? rparen : rbrace; */
 500         code = rbrace;
 501         break;
 502
 503     case 014:                   /* a form feed */
 504         unary_delim = ps.last_u_d;
 505         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 506                                  * right */
 507         code = form_feed;
 508         break;
 509
 510     case (','):
 511         unary_delim = true;
 512         code = comma;
 513         break;
 514
 515     case '.':
 516         unary_delim = false;
 517         code = period;
 518         break;
 519
 520     case '-':
 521     case '+':                   /* check for -, +, --, ++ */
 522         code = (ps.last_u_d ? unary_op : binary_op);
 523         unary_delim = true;
 524
 525         if (*buf_ptr == token[0]) {
 526             /* check for doubled character */
 527             *e_token++ = *buf_ptr++;
 528             /* buffer overflow will be checked at end of loop */
 529             if (last_code == ident || last_code == rparen) {
 530                 code = (ps.last_u_d ? unary_op : postop);
 531                 /* check for following ++ or -- */
 532                 unary_delim = false;
 533             }
 534         }
 535         else if (*buf_ptr == '=')
 536             /* check for operator += */
 537             *e_token++ = *buf_ptr++;
 538         else if (*buf_ptr == '>') {
 539             /* check for operator -> */
 540             *e_token++ = *buf_ptr++;
 541             if (!pointer_as_binop) {
 542                 unary_delim = false;
 543                 code = unary_op;
 544                 ps.want_blank = false;
 545             }
 546         }
 547         break;                  /* buffer overflow will be checked at end of
 548                                  * switch */
 549
 550     case '=':
 551         if (ps.in_or_st)
 552             ps.block_init = 1;
 553 #ifdef undef
 554         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 555             e_token[-1] = *buf_ptr++;
 556             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 557                 *e_token++ = *buf_ptr++;
 558             *e_token++ = '=';   /* Flip =+ to += */
 559             *e_token = 0;
 560         }
 561 #else
 562         if (*buf_ptr == '=') {/* == */
 563             *e_token++ = '=';   /* Flip =+ to += */
 564             buf_ptr++;
 565             *e_token = 0;
 566         }
 567 #endif
 568         code = binary_op;
 569         unary_delim = true;
 570         break;
 571         /* can drop thru!!! */
 572
 573     case '>':
 574     case '<':
 575     case '!':                   /* ops like <, <<, <=, !=, etc */
 576         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 577             *e_token++ = *buf_ptr;
 578             if (++buf_ptr >= buf_end)
 579                 fill_buffer();
 580         }
 581         if (*buf_ptr == '=')
 582             *e_token++ = *buf_ptr++;
 583         code = (ps.last_u_d ? unary_op : binary_op);
 584         unary_delim = true;
 585         break;
 586
 587     default:
 588         if (token[0] == '/' && *buf_ptr == '*') {
 589             /* it is start of comment */
 590             *e_token++ = '*';
 591
 592             if (++buf_ptr >= buf_end)
 593                 fill_buffer();
 594
 595             code = comment;
 596             unary_delim = ps.last_u_d;
 597             break;
 598         }
 599         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 600             /*
 601              * handle ||, &&, etc, and also things as in int *****i
 602              */
 603             *e_token++ = *buf_ptr;
 604             if (++buf_ptr >= buf_end)
 605                 fill_buffer();
 606         }
 607         code = (ps.last_u_d ? unary_op : binary_op);
 608         unary_delim = true;
 609
 610
 611     }                           /* end of switch */
 612     if (code != newline) {
 613         l_struct = false;
 614         last_code = code;
 615     }
 616     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 617         fill_buffer();
 618     ps.last_u_d = unary_delim;
 619     *e_token = '\0';            /* null terminate the token */
 620     return (code);
 621 }
 622
 623 /*
 624  * Add the given keyword to the keyword table, using val as the keyword type
 625  */
 626 void
 627 addkey(char *key, int val)
 628 {
 629     struct templ *p = specials;
 630     while (p->rwd)
 631         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 632             return;
 633         else
 634             p++;
 635     if (p >= specials + sizeof specials / sizeof specials[0])
 636         return;                 /* For now, table overflows are silently
 637                                  * ignored */
 638     p->rwd = key;
 639     p->rwcode = val;
 640     p[1].rwd = NULL;
 641     p[1].rwcode = 0;
 642 }