usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
  59 #define alphanum 1
  60 #define opchar 3
  61
  62 struct templ {
  63     const char *rwd;
  64     int         rwcode;
  65 };
  66
  67 struct templ specials[1000] =
  68 {
  69     {"switch", 1},
  70     {"case", 2},
  71     {"break", 0},
  72     {"struct", 3},
  73     {"union", 3},
  74     {"enum", 3},
  75     {"default", 2},
  76     {"int", 4},
  77     {"char", 4},
  78     {"float", 4},
  79     {"double", 4},
  80     {"long", 4},
  81     {"short", 4},
  82     {"typdef", 4},
  83     {"unsigned", 4},
  84     {"register", 4},
  85     {"static", 4},
  86     {"global", 4},
  87     {"extern", 4},
  88     {"void", 4},
  89     {"const", 4},
  90     {"volatile", 4},
  91     {"goto", 0},
  92     {"return", 0},
  93     {"if", 5},
  94     {"while", 5},
  95     {"for", 5},
  96     {"else", 6},
  97     {"do", 6},
  98     {"sizeof", 7},
  99     {0, 0}
 100 };
 101
 102 char        chartype[128] =
 103 {                               /* this is used to facilitate the decision of
 104                                  * what type (alphanumeric, operator) each
 105                                  * character is */
 106     0, 0, 0, 0, 0, 0, 0, 0,
 107     0, 0, 0, 0, 0, 0, 0, 0,
 108     0, 0, 0, 0, 0, 0, 0, 0,
 109     0, 0, 0, 0, 0, 0, 0, 0,
 110     0, 3, 0, 0, 1, 3, 3, 0,
 111     0, 0, 3, 3, 0, 3, 0, 3,
 112     1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 0, 0, 3, 3, 3, 3,
 114     0, 1, 1, 1, 1, 1, 1, 1,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 1, 1, 1, 1, 1,
 117     1, 1, 1, 0, 0, 0, 3, 1,
 118     0, 1, 1, 1, 1, 1, 1, 1,
 119     1, 1, 1, 1, 1, 1, 1, 1,
 120     1, 1, 1, 1, 1, 1, 1, 1,
 121     1, 1, 1, 0, 3, 0, 3, 0
 122 };
 123
 124 int
 125 lexi(void)
 126 {
 127     int         unary_delim;    /* this is set to 1 if the current token
 128                                  * forces a following operator to be unary */
 129     static int  last_code;      /* the last token type returned */
 130     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 131     int         code;           /* internal code to be returned */
 132     char        qchar;          /* the delimiter character for a string */
 133
 134     e_token = s_token;          /* point to start of place to save token */
 135     unary_delim = false;
 136     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 137                                  * column 1 iff the last thing scanned was nl */
 138     ps.last_nl = false;
 139
 140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 141         ps.col_1 = false;       /* leading blanks imply token is not in column
 142                                  * 1 */
 143         if (++buf_ptr >= buf_end)
 144             fill_buffer();
 145     }
 146
 147     /* Scan an alphanumeric token */
 148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149         /*
 150          * we have a character or number
 151          */
 152         const char *j;          /* used for searching thru list of
 153                                  *
 154                                  * reserved words */
 155         struct templ *p;
 156
 157         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 158             int         seendot = 0,
 159                         seenexp = 0,
 160                         seensfx = 0;
 161             if (*buf_ptr == '0' &&
 162                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 163                 *e_token++ = *buf_ptr++;
 164                 *e_token++ = *buf_ptr++;
 165                 while (isxdigit(*buf_ptr)) {
 166                     CHECK_SIZE_TOKEN;
 167                     *e_token++ = *buf_ptr++;
 168                 }
 169             }
 170             else
 171                 while (1) {
 172                     if (*buf_ptr == '.') {
 173                         if (seendot)
 174                             break;
 175                         else
 176                             seendot++;
 177                     }
 178                     CHECK_SIZE_TOKEN;
 179                     *e_token++ = *buf_ptr++;
 180                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 181                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 182                             break;
 183                         else {
 184                             seenexp++;
 185                             seendot++;
 186                             CHECK_SIZE_TOKEN;
 187                             *e_token++ = *buf_ptr++;
 188                             if (*buf_ptr == '+' || *buf_ptr == '-')
 189                                 *e_token++ = *buf_ptr++;
 190                         }
 191                     }
 192                 }
 193             while (1) {
 194                 if (!(seensfx & 1) &&
 195                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 196                     CHECK_SIZE_TOKEN;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 1;
 199                     continue;
 200                 }
 201                 if (!(seensfx & 2) &&
 202                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 203                     CHECK_SIZE_TOKEN;
 204                     if (buf_ptr[1] == buf_ptr[0])
 205                         *e_token++ = *buf_ptr++;
 206                     *e_token++ = *buf_ptr++;
 207                     seensfx |= 2;
 208                     continue;
 209                 }
 210                 break;
 211             }
 212         }
 213         else
 214             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 215                 /* fill_buffer() terminates buffer with newline */
 216                 if (*buf_ptr == BACKSLASH) {
 217                     if (*(buf_ptr + 1) == '\n') {
 218                         buf_ptr += 2;
 219                         if (buf_ptr >= buf_end)
 220                             fill_buffer();
 221                         } else
 222                             break;
 223                 }
 224                 CHECK_SIZE_TOKEN;
 225                 /* copy it over */
 226                 *e_token++ = *buf_ptr++;
 227                 if (buf_ptr >= buf_end)
 228                     fill_buffer();
 229             }
 230         *e_token++ = '\0';
 231         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 232             if (++buf_ptr >= buf_end)
 233                 fill_buffer();
 234         }
 235         ps.its_a_keyword = false;
 236         ps.sizeof_keyword = false;
 237         if (l_struct && !ps.p_l_follow) {
 238                                 /* if last token was 'struct' and we're not
 239                                  * in parentheses, then this token
 240                                  * should be treated as a declaration */
 241             l_struct = false;
 242             last_code = ident;
 243             ps.last_u_d = true;
 244             return (decl);
 245         }
 246         ps.last_u_d = l_struct; /* Operator after identifier is binary
 247                                  * unless last token was 'struct' */
 248         l_struct = false;
 249         last_code = ident;      /* Remember that this is the code we will
 250                                  * return */
 251
 252         if (auto_typedefs) {
 253             const char *q = s_token;
 254             /* Check if we have an "_t" in the end */
 255             if (q[0] && q[1] &&
 256                 (strcmp(q + strlen(q) - 2, "_t") == 0)) {
 257                 ps.its_a_keyword = true;
 258                 ps.last_u_d = true;
 259                 goto found_auto_typedef;
 260             }
 261         }
 262
 263         /*
 264          * This loop will check if the token is a keyword.
 265          */
 266         for (p = specials; (j = p->rwd) != 0; p++) {
 267             const char *q = s_token;    /* point at scanned token */
 268             if (*j++ != *q++ || *j++ != *q++)
 269                 continue;       /* This test depends on the fact that
 270                                  * identifiers are always at least 1 character
 271                                  * long (ie. the first two bytes of the
 272                                  * identifier are always meaningful) */
 273             if (q[-1] == 0)
 274                 break;          /* If its a one-character identifier */
 275             while (*q++ == *j)
 276                 if (*j++ == 0)
 277                     goto found_keyword; /* I wish that C had a multi-level
 278                                          * break... */
 279         }
 280         if (p->rwd) {           /* we have a keyword */
 281     found_keyword:
 282             ps.its_a_keyword = true;
 283             ps.last_u_d = true;
 284             switch (p->rwcode) {
 285             case 1:             /* it is a switch */
 286                 return (swstmt);
 287             case 2:             /* a case or default */
 288                 return (casestmt);
 289
 290             case 3:             /* a "struct" */
 291                 /*
 292                  * Next time around, we will want to know that we have had a
 293                  * 'struct'
 294                  */
 295                 l_struct = true;
 296                 /* FALLTHROUGH */
 297
 298             case 4:             /* one of the declaration keywords */
 299             found_auto_typedef:
 300                 if (ps.p_l_follow) {
 301                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 302                     break;      /* inside parens: cast, param list or sizeof */
 303                 }
 304                 last_code = decl;
 305                 return (decl);
 306
 307             case 5:             /* if, while, for */
 308                 return (sp_paren);
 309
 310             case 6:             /* do, else */
 311                 return (sp_nparen);
 312
 313             case 7:
 314                 ps.sizeof_keyword = true;
 315             default:            /* all others are treated like any other
 316                                  * identifier */
 317                 return (ident);
 318             }                   /* end of switch */
 319         }                       /* end of if (found_it) */
 320         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 321             char *tp = buf_ptr;
 322             while (tp < buf_end)
 323                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 324                     goto not_proc;
 325             strncpy(ps.procname, token, sizeof ps.procname - 1);
 326             ps.in_parameter_declaration = 1;
 327             rparen_count = 1;
 328     not_proc:;
 329         }
 330         /*
 331          * The following hack attempts to guess whether or not the current
 332          * token is in fact a declaration keyword -- one that has been
 333          * typedefd
 334          */
 335         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 336                 && !ps.p_l_follow
 337                 && !ps.block_init
 338                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 339                     ps.last_token == decl ||
 340                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 341             ps.its_a_keyword = true;
 342             ps.last_u_d = true;
 343             last_code = decl;
 344             return decl;
 345         }
 346         if (last_code == decl)  /* if this is a declared variable, then
 347                                  * following sign is unary */
 348             ps.last_u_d = true; /* will make "int a -1" work */
 349         last_code = ident;
 350         return (ident);         /* the ident is not in the list */
 351     }                           /* end of procesing for alpanum character */
 352
 353     /* Scan a non-alphanumeric token */
 354
 355     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 356                                  * moved here */
 357     *e_token = '\0';
 358     if (++buf_ptr >= buf_end)
 359         fill_buffer();
 360
 361     switch (*token) {
 362     case '\n':
 363         unary_delim = ps.last_u_d;
 364         ps.last_nl = true;      /* remember that we just had a newline */
 365         code = (had_eof ? 0 : newline);
 366
 367         /*
 368          * if data has been exhausted, the newline is a dummy, and we should
 369          * return code to stop
 370          */
 371         break;
 372
 373     case '\'':                  /* start of quoted character */
 374     case '"':                   /* start of string */
 375         qchar = *token;
 376         if (troff) {
 377             e_token[-1] = '`';
 378             if (qchar == '"')
 379                 *e_token++ = '`';
 380             e_token = chfont(&bodyf, &stringf, e_token);
 381         }
 382         do {                    /* copy the string */
 383             while (1) {         /* move one character or [/<char>]<char> */
 384                 if (*buf_ptr == '\n') {
 385                     diag2(1, "Unterminated literal");
 386                     goto stop_lit;
 387                 }
 388                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 389                                          * since CHECK_SIZE guarantees that there
 390                                          * are at least 5 entries left */
 391                 *e_token = *buf_ptr++;
 392                 if (buf_ptr >= buf_end)
 393                     fill_buffer();
 394                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 395                     if (*buf_ptr == '\n')       /* check for escaped newline */
 396                         ++line_no;
 397                     if (troff) {
 398                         *++e_token = BACKSLASH;
 399                         if (*buf_ptr == BACKSLASH)
 400                             *++e_token = BACKSLASH;
 401                     }
 402                     *++e_token = *buf_ptr++;
 403                     ++e_token;  /* we must increment this again because we
 404                                  * copied two chars */
 405                     if (buf_ptr >= buf_end)
 406                         fill_buffer();
 407                 }
 408                 else
 409                     break;      /* we copied one character */
 410             }                   /* end of while (1) */
 411         } while (*e_token++ != qchar);
 412         if (troff) {
 413             e_token = chfont(&stringf, &bodyf, e_token - 1);
 414             if (qchar == '"')
 415                 *e_token++ = '\'';
 416         }
 417 stop_lit:
 418         code = ident;
 419         break;
 420
 421     case ('('):
 422     case ('['):
 423         unary_delim = true;
 424         code = lparen;
 425         break;
 426
 427     case (')'):
 428     case (']'):
 429         code = rparen;
 430         break;
 431
 432     case '#':
 433         unary_delim = ps.last_u_d;
 434         code = preesc;
 435         break;
 436
 437     case '?':
 438         unary_delim = true;
 439         code = question;
 440         break;
 441
 442     case (':'):
 443         code = colon;
 444         unary_delim = true;
 445         break;
 446
 447     case (';'):
 448         unary_delim = true;
 449         code = semicolon;
 450         break;
 451
 452     case ('{'):
 453         unary_delim = true;
 454
 455         /*
 456          * if (ps.in_or_st) ps.block_init = 1;
 457          */
 458         /* ?    code = ps.block_init ? lparen : lbrace; */
 459         code = lbrace;
 460         break;
 461
 462     case ('}'):
 463         unary_delim = true;
 464         /* ?    code = ps.block_init ? rparen : rbrace; */
 465         code = rbrace;
 466         break;
 467
 468     case 014:                   /* a form feed */
 469         unary_delim = ps.last_u_d;
 470         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 471                                  * right */
 472         code = form_feed;
 473         break;
 474
 475     case (','):
 476         unary_delim = true;
 477         code = comma;
 478         break;
 479
 480     case '.':
 481         unary_delim = false;
 482         code = period;
 483         break;
 484
 485     case '-':
 486     case '+':                   /* check for -, +, --, ++ */
 487         code = (ps.last_u_d ? unary_op : binary_op);
 488         unary_delim = true;
 489
 490         if (*buf_ptr == token[0]) {
 491             /* check for doubled character */
 492             *e_token++ = *buf_ptr++;
 493             /* buffer overflow will be checked at end of loop */
 494             if (last_code == ident || last_code == rparen) {
 495                 code = (ps.last_u_d ? unary_op : postop);
 496                 /* check for following ++ or -- */
 497                 unary_delim = false;
 498             }
 499         }
 500         else if (*buf_ptr == '=')
 501             /* check for operator += */
 502             *e_token++ = *buf_ptr++;
 503         else if (*buf_ptr == '>') {
 504             /* check for operator -> */
 505             *e_token++ = *buf_ptr++;
 506             if (!pointer_as_binop) {
 507                 unary_delim = false;
 508                 code = unary_op;
 509                 ps.want_blank = false;
 510             }
 511         }
 512         break;                  /* buffer overflow will be checked at end of
 513                                  * switch */
 514
 515     case '=':
 516         if (ps.in_or_st)
 517             ps.block_init = 1;
 518 #ifdef undef
 519         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 520             e_token[-1] = *buf_ptr++;
 521             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 522                 *e_token++ = *buf_ptr++;
 523             *e_token++ = '=';   /* Flip =+ to += */
 524             *e_token = 0;
 525         }
 526 #else
 527         if (*buf_ptr == '=') {/* == */
 528             *e_token++ = '=';   /* Flip =+ to += */
 529             buf_ptr++;
 530             *e_token = 0;
 531         }
 532 #endif
 533         code = binary_op;
 534         unary_delim = true;
 535         break;
 536         /* can drop thru!!! */
 537
 538     case '>':
 539     case '<':
 540     case '!':                   /* ops like <, <<, <=, !=, etc */
 541         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 542             *e_token++ = *buf_ptr;
 543             if (++buf_ptr >= buf_end)
 544                 fill_buffer();
 545         }
 546         if (*buf_ptr == '=')
 547             *e_token++ = *buf_ptr++;
 548         code = (ps.last_u_d ? unary_op : binary_op);
 549         unary_delim = true;
 550         break;
 551
 552     default:
 553         if (token[0] == '/' && *buf_ptr == '*') {
 554             /* it is start of comment */
 555             *e_token++ = '*';
 556
 557             if (++buf_ptr >= buf_end)
 558                 fill_buffer();
 559
 560             code = comment;
 561             unary_delim = ps.last_u_d;
 562             break;
 563         }
 564         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 565             /*
 566              * handle ||, &&, etc, and also things as in int *****i
 567              */
 568             *e_token++ = *buf_ptr;
 569             if (++buf_ptr >= buf_end)
 570                 fill_buffer();
 571         }
 572         code = (ps.last_u_d ? unary_op : binary_op);
 573         unary_delim = true;
 574
 575
 576     }                           /* end of switch */
 577     if (code != newline) {
 578         l_struct = false;
 579         last_code = code;
 580     }
 581     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 582         fill_buffer();
 583     ps.last_u_d = unary_delim;
 584     *e_token = '\0';            /* null terminate the token */
 585     return (code);
 586 }
 587
 588 /*
 589  * Add the given keyword to the keyword table, using val as the keyword type
 590  */
 591 void
 592 addkey(char *key, int val)
 593 {
 594     struct templ *p = specials;
 595     while (p->rwd)
 596         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 597             return;
 598         else
 599             p++;
 600     if (p >= specials + sizeof specials / sizeof specials[0])
 601         return;                 /* For now, table overflows are silently
 602                                  * ignored */
 603     p->rwd = key;
 604     p->rwcode = val;
 605     p[1].rwd = 0;
 606     p[1].rwcode = 0;
 607 }