usr.bin/indent/lexi.c

   1 /*-
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
/*
 * Character classes stored in chartype[] below.
 */
#define alphanum 1              /* letters, digits, '_' and '$' */
#define opchar 3                /* characters that can appear in an operator */
  61
  62 struct templ {
  63     const char *rwd;
  64     int         rwcode;
  65 };
  66
  67 struct templ specials[1000] =
  68 {
  69     {"switch", 1},
  70     {"case", 2},
  71     {"break", 0},
  72     {"struct", 3},
  73     {"union", 3},
  74     {"enum", 3},
  75     {"default", 2},
  76     {"int", 4},
  77     {"char", 4},
  78     {"float", 4},
  79     {"double", 4},
  80     {"long", 4},
  81     {"short", 4},
  82     {"typedef", 4},
  83     {"unsigned", 4},
  84     {"register", 4},
  85     {"static", 4},
  86     {"global", 4},
  87     {"extern", 4},
  88     {"void", 4},
  89     {"const", 4},
  90     {"volatile", 4},
  91     {"goto", 0},
  92     {"return", 0},
  93     {"if", 5},
  94     {"while", 5},
  95     {"for", 5},
  96     {"else", 6},
  97     {"do", 6},
  98     {"sizeof", 7},
  99     {0, 0}
 100 };
 101
 102 char        chartype[128] =
 103 {                               /* this is used to facilitate the decision of
 104                                  * what type (alphanumeric, operator) each
 105                                  * character is */
 106     0, 0, 0, 0, 0, 0, 0, 0,
 107     0, 0, 0, 0, 0, 0, 0, 0,
 108     0, 0, 0, 0, 0, 0, 0, 0,
 109     0, 0, 0, 0, 0, 0, 0, 0,
 110     0, 3, 0, 0, 1, 3, 3, 0,
 111     0, 0, 3, 3, 0, 3, 0, 3,
 112     1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 0, 0, 3, 3, 3, 3,
 114     0, 1, 1, 1, 1, 1, 1, 1,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 1, 1, 1, 1, 1,
 117     1, 1, 1, 0, 0, 0, 3, 1,
 118     0, 1, 1, 1, 1, 1, 1, 1,
 119     1, 1, 1, 1, 1, 1, 1, 1,
 120     1, 1, 1, 1, 1, 1, 1, 1,
 121     1, 1, 1, 0, 3, 0, 3, 0
 122 };
 123
 124 int
 125 lexi(void)
 126 {
 127     int         unary_delim;    /* this is set to 1 if the current token
 128                                  * forces a following operator to be unary */
 129     static int  last_code;      /* the last token type returned */
 130     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 131     int         code;           /* internal code to be returned */
 132     char        qchar;          /* the delimiter character for a string */
 133
 134     e_token = s_token;          /* point to start of place to save token */
 135     unary_delim = false;
 136     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 137                                  * column 1 iff the last thing scanned was nl */
 138     ps.last_nl = false;
 139
 140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 141         ps.col_1 = false;       /* leading blanks imply token is not in column
 142                                  * 1 */
 143         if (++buf_ptr >= buf_end)
 144             fill_buffer();
 145     }
 146
 147     /* Scan an alphanumeric token */
 148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149         /*
 150          * we have a character or number
 151          */
 152         const char *j;          /* used for searching thru list of
 153                                  *
 154                                  * reserved words */
 155         struct templ *p;
 156
 157         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 158             int         seendot = 0,
 159                         seenexp = 0,
 160                         seensfx = 0;
 161             if (*buf_ptr == '0' &&
 162                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 163                 *e_token++ = *buf_ptr++;
 164                 *e_token++ = *buf_ptr++;
 165                 while (isxdigit(*buf_ptr)) {
 166                     CHECK_SIZE_TOKEN;
 167                     *e_token++ = *buf_ptr++;
 168                 }
 169             }
 170             else
 171                 while (1) {
 172                     if (*buf_ptr == '.') {
 173                         if (seendot)
 174                             break;
 175                         else
 176                             seendot++;
 177                     }
 178                     CHECK_SIZE_TOKEN;
 179                     *e_token++ = *buf_ptr++;
 180                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 181                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 182                             break;
 183                         else {
 184                             seenexp++;
 185                             seendot++;
 186                             CHECK_SIZE_TOKEN;
 187                             *e_token++ = *buf_ptr++;
 188                             if (*buf_ptr == '+' || *buf_ptr == '-')
 189                                 *e_token++ = *buf_ptr++;
 190                         }
 191                     }
 192                 }
 193             while (1) {
 194                 if (!(seensfx & 1) && (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 195                     CHECK_SIZE_TOKEN;
 196                     *e_token++ = *buf_ptr++;
 197                     seensfx |= 1;
 198                     continue;
 199                 }
 200                 if (!(seensfx & 2) && (strchr("fFlL", *buf_ptr) != NULL)) {
 201                     CHECK_SIZE_TOKEN;
 202                     if (buf_ptr[1] == buf_ptr[0])
 203                         *e_token++ = *buf_ptr++;
 204                     *e_token++ = *buf_ptr++;
 205                     seensfx |= 2;
 206                     continue;
 207                 }
 208                 break;
 209             }
 210         }
 211         else
 212             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 213                 /* fill_buffer() terminates buffer with newline */
 214                 if (*buf_ptr == BACKSLASH) {
 215                     if (*(buf_ptr + 1) == '\n') {
 216                         buf_ptr += 2;
 217                         if (buf_ptr >= buf_end)
 218                             fill_buffer();
 219                         } else
 220                             break;
 221                 }
 222                 CHECK_SIZE_TOKEN;
 223                 /* copy it over */
 224                 *e_token++ = *buf_ptr++;
 225                 if (buf_ptr >= buf_end)
 226                     fill_buffer();
 227             }
 228         *e_token++ = '\0';
 229         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 230             if (++buf_ptr >= buf_end)
 231                 fill_buffer();
 232         }
 233         ps.its_a_keyword = false;
 234         ps.sizeof_keyword = false;
 235         if (l_struct && !ps.p_l_follow) {
 236                                 /* if last token was 'struct' and we're not
 237                                  * in parentheses, then this token
 238                                  * should be treated as a declaration */
 239             l_struct = false;
 240             last_code = ident;
 241             ps.last_u_d = true;
 242             return (decl);
 243         }
 244         ps.last_u_d = l_struct; /* Operator after identifier is binary
 245                                  * unless last token was 'struct' */
 246         l_struct = false;
 247         last_code = ident;      /* Remember that this is the code we will
 248                                  * return */
 249
 250         if (auto_typedefs) {
 251             const char *q = s_token;
 252             size_t q_len = strlen(q);
 253             /* Check if we have an "_t" in the end */
 254             if (q_len > 2 &&
 255                 (strcmp(q + q_len - 2, "_t") == 0)) {
 256                 ps.its_a_keyword = true;
 257                 ps.last_u_d = true;
 258                 goto found_auto_typedef;
 259             }
 260         }
 261
 262         /*
 263          * This loop will check if the token is a keyword.
 264          */
 265         for (p = specials; (j = p->rwd) != NULL; p++) {
 266             const char *q = s_token;    /* point at scanned token */
 267             if (*j++ != *q++ || *j++ != *q++)
 268                 continue;       /* This test depends on the fact that
 269                                  * identifiers are always at least 1 character
 270                                  * long (ie. the first two bytes of the
 271                                  * identifier are always meaningful) */
 272             if (q[-1] == 0)
 273                 break;          /* If its a one-character identifier */
 274             while (*q++ == *j)
 275                 if (*j++ == 0)
 276                     goto found_keyword; /* I wish that C had a multi-level
 277                                          * break... */
 278         }
 279         if (p->rwd) {           /* we have a keyword */
 280     found_keyword:
 281             ps.its_a_keyword = true;
 282             ps.last_u_d = true;
 283             switch (p->rwcode) {
 284             case 1:             /* it is a switch */
 285                 return (swstmt);
 286             case 2:             /* a case or default */
 287                 return (casestmt);
 288
 289             case 3:             /* a "struct" */
 290                 /*
 291                  * Next time around, we will want to know that we have had a
 292                  * 'struct'
 293                  */
 294                 l_struct = true;
 295                 /* FALLTHROUGH */
 296
 297             case 4:             /* one of the declaration keywords */
 298             found_auto_typedef:
 299                 if (ps.p_l_follow) {
 300                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 301                     break;      /* inside parens: cast, param list or sizeof */
 302                 }
 303                 last_code = decl;
 304                 return (decl);
 305
 306             case 5:             /* if, while, for */
 307                 return (sp_paren);
 308
 309             case 6:             /* do, else */
 310                 return (sp_nparen);
 311
 312             case 7:
 313                 ps.sizeof_keyword = true;
 314             default:            /* all others are treated like any other
 315                                  * identifier */
 316                 return (ident);
 317             }                   /* end of switch */
 318         }                       /* end of if (found_it) */
 319         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 320             char *tp = buf_ptr;
 321             while (tp < buf_end)
 322                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 323                     goto not_proc;
 324             strncpy(ps.procname, token, sizeof ps.procname - 1);
 325             ps.in_parameter_declaration = 1;
 326             rparen_count = 1;
 327     not_proc:;
 328         }
 329         /*
 330          * The following hack attempts to guess whether or not the current
 331          * token is in fact a declaration keyword -- one that has been
 332          * typedefd
 333          */
 334         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 335                 && !ps.p_l_follow
 336                 && !ps.block_init
 337                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 338                     ps.last_token == decl ||
 339                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 340             ps.its_a_keyword = true;
 341             ps.last_u_d = true;
 342             last_code = decl;
 343             return decl;
 344         }
 345         if (last_code == decl)  /* if this is a declared variable, then
 346                                  * following sign is unary */
 347             ps.last_u_d = true; /* will make "int a -1" work */
 348         last_code = ident;
 349         return (ident);         /* the ident is not in the list */
 350     }                           /* end of procesing for alpanum character */
 351
 352     /* Scan a non-alphanumeric token */
 353
 354     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 355                                  * moved here */
 356     *e_token = '\0';
 357     if (++buf_ptr >= buf_end)
 358         fill_buffer();
 359
 360     switch (*token) {
 361     case '\n':
 362         unary_delim = ps.last_u_d;
 363         ps.last_nl = true;      /* remember that we just had a newline */
 364         code = (had_eof ? 0 : newline);
 365
 366         /*
 367          * if data has been exhausted, the newline is a dummy, and we should
 368          * return code to stop
 369          */
 370         break;
 371
 372     case '\'':                  /* start of quoted character */
 373     case '"':                   /* start of string */
 374         qchar = *token;
 375         if (troff) {
 376             e_token[-1] = '`';
 377             if (qchar == '"')
 378                 *e_token++ = '`';
 379             e_token = chfont(&bodyf, &stringf, e_token);
 380         }
 381         do {                    /* copy the string */
 382             while (1) {         /* move one character or [/<char>]<char> */
 383                 if (*buf_ptr == '\n') {
 384                     diag2(1, "Unterminated literal");
 385                     goto stop_lit;
 386                 }
 387                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 388                                          * since CHECK_SIZE guarantees that there
 389                                          * are at least 5 entries left */
 390                 *e_token = *buf_ptr++;
 391                 if (buf_ptr >= buf_end)
 392                     fill_buffer();
 393                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 394                     if (*buf_ptr == '\n')       /* check for escaped newline */
 395                         ++line_no;
 396                     if (troff) {
 397                         *++e_token = BACKSLASH;
 398                         if (*buf_ptr == BACKSLASH)
 399                             *++e_token = BACKSLASH;
 400                     }
 401                     *++e_token = *buf_ptr++;
 402                     ++e_token;  /* we must increment this again because we
 403                                  * copied two chars */
 404                     if (buf_ptr >= buf_end)
 405                         fill_buffer();
 406                 }
 407                 else
 408                     break;      /* we copied one character */
 409             }                   /* end of while (1) */
 410         } while (*e_token++ != qchar);
 411         if (troff) {
 412             e_token = chfont(&stringf, &bodyf, e_token - 1);
 413             if (qchar == '"')
 414                 *e_token++ = '\'';
 415         }
 416 stop_lit:
 417         code = ident;
 418         break;
 419
 420     case ('('):
 421     case ('['):
 422         unary_delim = true;
 423         code = lparen;
 424         break;
 425
 426     case (')'):
 427     case (']'):
 428         code = rparen;
 429         break;
 430
 431     case '#':
 432         unary_delim = ps.last_u_d;
 433         code = preesc;
 434         break;
 435
 436     case '?':
 437         unary_delim = true;
 438         code = question;
 439         break;
 440
 441     case (':'):
 442         code = colon;
 443         unary_delim = true;
 444         break;
 445
 446     case (';'):
 447         unary_delim = true;
 448         code = semicolon;
 449         break;
 450
 451     case ('{'):
 452         unary_delim = true;
 453
 454         /*
 455          * if (ps.in_or_st) ps.block_init = 1;
 456          */
 457         /* ?    code = ps.block_init ? lparen : lbrace; */
 458         code = lbrace;
 459         break;
 460
 461     case ('}'):
 462         unary_delim = true;
 463         /* ?    code = ps.block_init ? rparen : rbrace; */
 464         code = rbrace;
 465         break;
 466
 467     case 014:                   /* a form feed */
 468         unary_delim = ps.last_u_d;
 469         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 470                                  * right */
 471         code = form_feed;
 472         break;
 473
 474     case (','):
 475         unary_delim = true;
 476         code = comma;
 477         break;
 478
 479     case '.':
 480         unary_delim = false;
 481         code = period;
 482         break;
 483
 484     case '-':
 485     case '+':                   /* check for -, +, --, ++ */
 486         code = (ps.last_u_d ? unary_op : binary_op);
 487         unary_delim = true;
 488
 489         if (*buf_ptr == token[0]) {
 490             /* check for doubled character */
 491             *e_token++ = *buf_ptr++;
 492             /* buffer overflow will be checked at end of loop */
 493             if (last_code == ident || last_code == rparen) {
 494                 code = (ps.last_u_d ? unary_op : postop);
 495                 /* check for following ++ or -- */
 496                 unary_delim = false;
 497             }
 498         }
 499         else if (*buf_ptr == '=')
 500             /* check for operator += */
 501             *e_token++ = *buf_ptr++;
 502         else if (*buf_ptr == '>') {
 503             /* check for operator -> */
 504             *e_token++ = *buf_ptr++;
 505             if (!pointer_as_binop) {
 506                 unary_delim = false;
 507                 code = unary_op;
 508                 ps.want_blank = false;
 509             }
 510         }
 511         break;                  /* buffer overflow will be checked at end of
 512                                  * switch */
 513
 514     case '=':
 515         if (ps.in_or_st)
 516             ps.block_init = 1;
 517 #ifdef undef
 518         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 519             e_token[-1] = *buf_ptr++;
 520             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 521                 *e_token++ = *buf_ptr++;
 522             *e_token++ = '=';   /* Flip =+ to += */
 523             *e_token = 0;
 524         }
 525 #else
 526         if (*buf_ptr == '=') {/* == */
 527             *e_token++ = '=';   /* Flip =+ to += */
 528             buf_ptr++;
 529             *e_token = 0;
 530         }
 531 #endif
 532         code = binary_op;
 533         unary_delim = true;
 534         break;
 535         /* can drop thru!!! */
 536
 537     case '>':
 538     case '<':
 539     case '!':                   /* ops like <, <<, <=, !=, etc */
 540         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 541             *e_token++ = *buf_ptr;
 542             if (++buf_ptr >= buf_end)
 543                 fill_buffer();
 544         }
 545         if (*buf_ptr == '=')
 546             *e_token++ = *buf_ptr++;
 547         code = (ps.last_u_d ? unary_op : binary_op);
 548         unary_delim = true;
 549         break;
 550
 551     default:
 552         if (token[0] == '/' && *buf_ptr == '*') {
 553             /* it is start of comment */
 554             *e_token++ = '*';
 555
 556             if (++buf_ptr >= buf_end)
 557                 fill_buffer();
 558
 559             code = comment;
 560             unary_delim = ps.last_u_d;
 561             break;
 562         }
 563         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 564             /*
 565              * handle ||, &&, etc, and also things as in int *****i
 566              */
 567             *e_token++ = *buf_ptr;
 568             if (++buf_ptr >= buf_end)
 569                 fill_buffer();
 570         }
 571         code = (ps.last_u_d ? unary_op : binary_op);
 572         unary_delim = true;
 573
 574
 575     }                           /* end of switch */
 576     if (code != newline) {
 577         l_struct = false;
 578         last_code = code;
 579     }
 580     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 581         fill_buffer();
 582     ps.last_u_d = unary_delim;
 583     *e_token = '\0';            /* null terminate the token */
 584     return (code);
 585 }
 586
 587 /*
 588  * Add the given keyword to the keyword table, using val as the keyword type
 589  */
 590 void
 591 addkey(char *key, int val)
 592 {
 593     struct templ *p = specials;
 594     while (p->rwd)
 595         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 596             return;
 597         else
 598             p++;
 599     if (p >= specials + sizeof specials / sizeof specials[0])
 600         return;                 /* For now, table overflows are silently
 601                                  * ignored */
 602     p->rwd = key;
 603     p->rwcode = val;
 604     p[1].rwd = NULL;
 605     p[1].rwcode = 0;
 606 }