usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
   35
      /*
       * Version identification.  The historical SCCS id string is excluded from
       * the build by the enclosing "#if 0" (and, inside that, by "#ifndef lint").
       */
   36 #if 0
   37 #ifndef lint
   38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
   39 #endif /* not lint */
   40 #endif
   41 #include <sys/cdefs.h>
      /*
       * NOTE(review): __FBSDID is not defined in this file; presumably it is
       * supplied by <sys/cdefs.h> and records the revision id -- confirm.
       */
   42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
   58
      /*
       * Character classes stored in chartype[] below (entries that are neither
       * are 0).  lexi() compares table entries with alphanum to recognise the
       * characters that may start or continue an identifier or a number; opchar
       * is referenced only from the code under "#ifdef undef" in lexi().
       */
   59 #define alphanum 1
   60 #define opchar 3
   61
      /* One entry of the reserved-word table (specials[]) searched by lexi(). */
   62 struct templ {
   63     const char *rwd;            /* spelling of the word; a null pointer marks the end of the table */
   64     int         rwcode;         /* keyword class, selects the branch taken in lexi()'s switch */
   65 };
   66
      /*
       * Reserved-word table.  lexi() walks it from the start until it meets the
       * entry whose rwd is null; addkey() appends further words in front of that
       * terminator, which is why the array is declared much larger than the
       * initial contents.
       *
       * Meaning of rwcode, as acted upon by the switch in lexi():
       *   1  "switch"                 -> swstmt
       *   2  "case" / "default"       -> casestmt
       *   3  "struct"/"union"/"enum"  -> remembered in l_struct, then handled as 4
       *   4  declaration keywords     -> decl (inside parentheses only the cast
       *                                  mask is updated)
       *   5  "if" / "while" / "for"   -> sp_paren
       *   6  "else" / "do"            -> sp_nparen
       *   7  "sizeof"                 -> sets ps.sizeof_keyword, returned as ident
       *   any other value (0 here)    -> returned as ident
       */
   67 struct templ specials[1000] =
   68 {
   69     {"switch", 1},
   70     {"case", 2},
   71     {"break", 0},
   72     {"struct", 3},
   73     {"union", 3},
   74     {"enum", 3},
   75     {"default", 2},
   76     {"int", 4},
   77     {"char", 4},
   78     {"float", 4},
   79     {"double", 4},
   80     {"long", 4},
   81     {"short", 4},
   82     {"typedef", 4},
   83     {"unsigned", 4},
   84     {"register", 4},
   85     {"static", 4},
   86     {"global", 4},
   87     {"extern", 4},
   88     {"void", 4},
   89     {"const", 4},
   90     {"volatile", 4},
   91     {"goto", 0},
   92     {"return", 0},
   93     {"if", 5},
   94     {"while", 5},
   95     {"for", 5},
   96     {"else", 6},
   97     {"do", 6},
   98     {"sizeof", 7},
   99     {0, 0}
  100 };
  101
      /*
       * Character classification table, indexed by 7-bit character code, eight
       * entries per row.  1 is alphanum, 3 is opchar, 0 is neither.  Note that
       * '$' and '_' are classified as alphanum.  The table has exactly 128
       * entries, so only codes 0..127 are valid indexes.
       */
  102 char        chartype[128] =
  103 {                               /* this is used to facilitate the decision of
  104                                  * what type (alphanumeric, operator) each
  105                                  * character is */
  106     0, 0, 0, 0, 0, 0, 0, 0,     /* 000-007 control characters */
  107     0, 0, 0, 0, 0, 0, 0, 0,     /* 010-017 control characters */
  108     0, 0, 0, 0, 0, 0, 0, 0,     /* 020-027 control characters */
  109     0, 0, 0, 0, 0, 0, 0, 0,     /* 030-037 control characters */
  110     0, 3, 0, 0, 1, 3, 3, 0,     /* sp ! " # $ % & ' */
  111     0, 0, 3, 3, 0, 3, 0, 3,     /* ( ) * + , - . / */
  112     1, 1, 1, 1, 1, 1, 1, 1,     /* 0 1 2 3 4 5 6 7 */
  113     1, 1, 0, 0, 3, 3, 3, 3,     /* 8 9 : ; < = > ? */
  114     0, 1, 1, 1, 1, 1, 1, 1,     /* @ A B C D E F G */
  115     1, 1, 1, 1, 1, 1, 1, 1,     /* H I J K L M N O */
  116     1, 1, 1, 1, 1, 1, 1, 1,     /* P Q R S T U V W */
  117     1, 1, 1, 0, 0, 0, 3, 1,     /* X Y Z [ \ ] ^ _ */
  118     0, 1, 1, 1, 1, 1, 1, 1,     /* ` a b c d e f g */
  119     1, 1, 1, 1, 1, 1, 1, 1,     /* h i j k l m n o */
  120     1, 1, 1, 1, 1, 1, 1, 1,     /* p q r s t u v w */
  121     1, 1, 1, 0, 3, 0, 3, 0      /* x y z { | } ~ DEL */
  122 };
 123
 124 int
 125 lexi(void)
 126 {
 127     int         unary_delim;    /* this is set to 1 if the current token
 128                                  * forces a following operator to be unary */
 129     static int  last_code;      /* the last token type returned */
 130     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 131     int         code;           /* internal code to be returned */
 132     char        qchar;          /* the delimiter character for a string */
 133
 134     e_token = s_token;          /* point to start of place to save token */
 135     unary_delim = false;
 136     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 137                                  * column 1 iff the last thing scanned was nl */
 138     ps.last_nl = false;
 139
 140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 141         ps.col_1 = false;       /* leading blanks imply token is not in column
 142                                  * 1 */
 143         if (++buf_ptr >= buf_end)
 144             fill_buffer();
 145     }
 146
 147     /* Scan an alphanumeric token */
 148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149         /*
 150          * we have a character or number
 151          */
 152         const char *j;          /* used for searching thru list of
 153                                  *
 154                                  * reserved words */
 155         struct templ *p;
 156
 157         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 158             int         seendot = 0,
 159                         seenexp = 0,
 160                         seensfx = 0;
 161             if (*buf_ptr == '0' &&
 162                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 163                 *e_token++ = *buf_ptr++;
 164                 *e_token++ = *buf_ptr++;
 165                 while (isxdigit(*buf_ptr)) {
 166                     CHECK_SIZE_TOKEN;
 167                     *e_token++ = *buf_ptr++;
 168                 }
 169             }
 170             else
 171                 while (1) {
 172                     if (*buf_ptr == '.') {
 173                         if (seendot)
 174                             break;
 175                         else
 176                             seendot++;
 177                     }
 178                     CHECK_SIZE_TOKEN;
 179                     *e_token++ = *buf_ptr++;
 180                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 181                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 182                             break;
 183                         else {
 184                             seenexp++;
 185                             seendot++;
 186                             CHECK_SIZE_TOKEN;
 187                             *e_token++ = *buf_ptr++;
 188                             if (*buf_ptr == '+' || *buf_ptr == '-')
 189                                 *e_token++ = *buf_ptr++;
 190                         }
 191                     }
 192                 }
 193             while (1) {
 194                 if (!(seensfx & 1) &&
 195                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 196                     CHECK_SIZE_TOKEN;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 1;
 199                     continue;
 200                 }
 201                 if (!(seensfx & 2) &&
 202                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 203                     CHECK_SIZE_TOKEN;
 204                     if (buf_ptr[1] == buf_ptr[0])
 205                         *e_token++ = *buf_ptr++;
 206                     *e_token++ = *buf_ptr++;
 207                     seensfx |= 2;
 208                     continue;
 209                 }
 210                 break;
 211             }
 212         }
 213         else
 214             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 215                 /* fill_buffer() terminates buffer with newline */
 216                 if (*buf_ptr == BACKSLASH) {
 217                     if (*(buf_ptr + 1) == '\n') {
 218                         buf_ptr += 2;
 219                         if (buf_ptr >= buf_end)
 220                             fill_buffer();
 221                         } else
 222                             break;
 223                 }
 224                 CHECK_SIZE_TOKEN;
 225                 /* copy it over */
 226                 *e_token++ = *buf_ptr++;
 227                 if (buf_ptr >= buf_end)
 228                     fill_buffer();
 229             }
 230         *e_token++ = '\0';
 231
 232         if (s_token[0] == 'L' && s_token[1] == '\0' &&
 233               (*buf_ptr == '"' || *buf_ptr == '\''))
 234             return (strpfx);
 235
 236         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 237             if (++buf_ptr >= buf_end)
 238                 fill_buffer();
 239         }
 240         ps.its_a_keyword = false;
 241         ps.sizeof_keyword = false;
 242         if (l_struct && !ps.p_l_follow) {
 243                                 /* if last token was 'struct' and we're not
 244                                  * in parentheses, then this token
 245                                  * should be treated as a declaration */
 246             l_struct = false;
 247             last_code = ident;
 248             ps.last_u_d = true;
 249             return (decl);
 250         }
 251         ps.last_u_d = l_struct; /* Operator after identifier is binary
 252                                  * unless last token was 'struct' */
 253         l_struct = false;
 254         last_code = ident;      /* Remember that this is the code we will
 255                                  * return */
 256
 257         if (auto_typedefs) {
 258             const char *q = s_token;
 259             size_t q_len = strlen(q);
 260             /* Check if we have an "_t" in the end */
 261             if (q_len > 2 &&
 262                 (strcmp(q + q_len - 2, "_t") == 0)) {
 263                 ps.its_a_keyword = true;
 264                 ps.last_u_d = true;
 265                 goto found_auto_typedef;
 266             }
 267         }
 268
 269         /*
 270          * This loop will check if the token is a keyword.
 271          */
 272         for (p = specials; (j = p->rwd) != 0; p++) {
 273             const char *q = s_token;    /* point at scanned token */
 274             if (*j++ != *q++ || *j++ != *q++)
 275                 continue;       /* This test depends on the fact that
 276                                  * identifiers are always at least 1 character
 277                                  * long (ie. the first two bytes of the
 278                                  * identifier are always meaningful) */
 279             if (q[-1] == 0)
 280                 break;          /* If its a one-character identifier */
 281             while (*q++ == *j)
 282                 if (*j++ == 0)
 283                     goto found_keyword; /* I wish that C had a multi-level
 284                                          * break... */
 285         }
 286         if (p->rwd) {           /* we have a keyword */
 287     found_keyword:
 288             ps.its_a_keyword = true;
 289             ps.last_u_d = true;
 290             switch (p->rwcode) {
 291             case 1:             /* it is a switch */
 292                 return (swstmt);
 293             case 2:             /* a case or default */
 294                 return (casestmt);
 295
 296             case 3:             /* a "struct" */
 297                 /*
 298                  * Next time around, we will want to know that we have had a
 299                  * 'struct'
 300                  */
 301                 l_struct = true;
 302                 /* FALLTHROUGH */
 303
 304             case 4:             /* one of the declaration keywords */
 305             found_auto_typedef:
 306                 if (ps.p_l_follow) {
 307                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 308                     break;      /* inside parens: cast, param list or sizeof */
 309                 }
 310                 last_code = decl;
 311                 return (decl);
 312
 313             case 5:             /* if, while, for */
 314                 return (sp_paren);
 315
 316             case 6:             /* do, else */
 317                 return (sp_nparen);
 318
 319             case 7:
 320                 ps.sizeof_keyword = true;
 321             default:            /* all others are treated like any other
 322                                  * identifier */
 323                 return (ident);
 324             }                   /* end of switch */
 325         }                       /* end of if (found_it) */
 326         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 327             char *tp = buf_ptr;
 328             while (tp < buf_end)
 329                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 330                     goto not_proc;
 331             strncpy(ps.procname, token, sizeof ps.procname - 1);
 332             ps.in_parameter_declaration = 1;
 333             rparen_count = 1;
 334     not_proc:;
 335         }
 336         /*
 337          * The following hack attempts to guess whether or not the current
 338          * token is in fact a declaration keyword -- one that has been
 339          * typedefd
 340          */
 341         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 342                 && !ps.p_l_follow
 343                 && !ps.block_init
 344                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 345                     ps.last_token == decl ||
 346                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 347             ps.its_a_keyword = true;
 348             ps.last_u_d = true;
 349             last_code = decl;
 350             return decl;
 351         }
 352         if (last_code == decl)  /* if this is a declared variable, then
 353                                  * following sign is unary */
 354             ps.last_u_d = true; /* will make "int a -1" work */
 355         last_code = ident;
 356         return (ident);         /* the ident is not in the list */
 357     }                           /* end of procesing for alpanum character */
 358
 359     /* Scan a non-alphanumeric token */
 360
 361     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 362                                  * moved here */
 363     *e_token = '\0';
 364     if (++buf_ptr >= buf_end)
 365         fill_buffer();
 366
 367     switch (*token) {
 368     case '\n':
 369         unary_delim = ps.last_u_d;
 370         ps.last_nl = true;      /* remember that we just had a newline */
 371         code = (had_eof ? 0 : newline);
 372
 373         /*
 374          * if data has been exhausted, the newline is a dummy, and we should
 375          * return code to stop
 376          */
 377         break;
 378
 379     case '\'':                  /* start of quoted character */
 380     case '"':                   /* start of string */
 381         qchar = *token;
 382         if (troff) {
 383             e_token[-1] = '`';
 384             if (qchar == '"')
 385                 *e_token++ = '`';
 386             e_token = chfont(&bodyf, &stringf, e_token);
 387         }
 388         do {                    /* copy the string */
 389             while (1) {         /* move one character or [/<char>]<char> */
 390                 if (*buf_ptr == '\n') {
 391                     diag2(1, "Unterminated literal");
 392                     goto stop_lit;
 393                 }
 394                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 395                                          * since CHECK_SIZE guarantees that there
 396                                          * are at least 5 entries left */
 397                 *e_token = *buf_ptr++;
 398                 if (buf_ptr >= buf_end)
 399                     fill_buffer();
 400                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 401                     if (*buf_ptr == '\n')       /* check for escaped newline */
 402                         ++line_no;
 403                     if (troff) {
 404                         *++e_token = BACKSLASH;
 405                         if (*buf_ptr == BACKSLASH)
 406                             *++e_token = BACKSLASH;
 407                     }
 408                     *++e_token = *buf_ptr++;
 409                     ++e_token;  /* we must increment this again because we
 410                                  * copied two chars */
 411                     if (buf_ptr >= buf_end)
 412                         fill_buffer();
 413                 }
 414                 else
 415                     break;      /* we copied one character */
 416             }                   /* end of while (1) */
 417         } while (*e_token++ != qchar);
 418         if (troff) {
 419             e_token = chfont(&stringf, &bodyf, e_token - 1);
 420             if (qchar == '"')
 421                 *e_token++ = '\'';
 422         }
 423 stop_lit:
 424         code = ident;
 425         break;
 426
 427     case ('('):
 428     case ('['):
 429         unary_delim = true;
 430         code = lparen;
 431         break;
 432
 433     case (')'):
 434     case (']'):
 435         code = rparen;
 436         break;
 437
 438     case '#':
 439         unary_delim = ps.last_u_d;
 440         code = preesc;
 441         break;
 442
 443     case '?':
 444         unary_delim = true;
 445         code = question;
 446         break;
 447
 448     case (':'):
 449         code = colon;
 450         unary_delim = true;
 451         break;
 452
 453     case (';'):
 454         unary_delim = true;
 455         code = semicolon;
 456         break;
 457
 458     case ('{'):
 459         unary_delim = true;
 460
 461         /*
 462          * if (ps.in_or_st) ps.block_init = 1;
 463          */
 464         /* ?    code = ps.block_init ? lparen : lbrace; */
 465         code = lbrace;
 466         break;
 467
 468     case ('}'):
 469         unary_delim = true;
 470         /* ?    code = ps.block_init ? rparen : rbrace; */
 471         code = rbrace;
 472         break;
 473
 474     case 014:                   /* a form feed */
 475         unary_delim = ps.last_u_d;
 476         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 477                                  * right */
 478         code = form_feed;
 479         break;
 480
 481     case (','):
 482         unary_delim = true;
 483         code = comma;
 484         break;
 485
 486     case '.':
 487         unary_delim = false;
 488         code = period;
 489         break;
 490
 491     case '-':
 492     case '+':                   /* check for -, +, --, ++ */
 493         code = (ps.last_u_d ? unary_op : binary_op);
 494         unary_delim = true;
 495
 496         if (*buf_ptr == token[0]) {
 497             /* check for doubled character */
 498             *e_token++ = *buf_ptr++;
 499             /* buffer overflow will be checked at end of loop */
 500             if (last_code == ident || last_code == rparen) {
 501                 code = (ps.last_u_d ? unary_op : postop);
 502                 /* check for following ++ or -- */
 503                 unary_delim = false;
 504             }
 505         }
 506         else if (*buf_ptr == '=')
 507             /* check for operator += */
 508             *e_token++ = *buf_ptr++;
 509         else if (*buf_ptr == '>') {
 510             /* check for operator -> */
 511             *e_token++ = *buf_ptr++;
 512             if (!pointer_as_binop) {
 513                 unary_delim = false;
 514                 code = unary_op;
 515                 ps.want_blank = false;
 516             }
 517         }
 518         break;                  /* buffer overflow will be checked at end of
 519                                  * switch */
 520
 521     case '=':
 522         if (ps.in_or_st)
 523             ps.block_init = 1;
 524 #ifdef undef
 525         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 526             e_token[-1] = *buf_ptr++;
 527             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 528                 *e_token++ = *buf_ptr++;
 529             *e_token++ = '=';   /* Flip =+ to += */
 530             *e_token = 0;
 531         }
 532 #else
 533         if (*buf_ptr == '=') {/* == */
 534             *e_token++ = '=';   /* Flip =+ to += */
 535             buf_ptr++;
 536             *e_token = 0;
 537         }
 538 #endif
 539         code = binary_op;
 540         unary_delim = true;
 541         break;
 542         /* can drop thru!!! */
 543
 544     case '>':
 545     case '<':
 546     case '!':                   /* ops like <, <<, <=, !=, etc */
 547         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 548             *e_token++ = *buf_ptr;
 549             if (++buf_ptr >= buf_end)
 550                 fill_buffer();
 551         }
 552         if (*buf_ptr == '=')
 553             *e_token++ = *buf_ptr++;
 554         code = (ps.last_u_d ? unary_op : binary_op);
 555         unary_delim = true;
 556         break;
 557
 558     default:
 559         if (token[0] == '/' && *buf_ptr == '*') {
 560             /* it is start of comment */
 561             *e_token++ = '*';
 562
 563             if (++buf_ptr >= buf_end)
 564                 fill_buffer();
 565
 566             code = comment;
 567             unary_delim = ps.last_u_d;
 568             break;
 569         }
 570         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 571             /*
 572              * handle ||, &&, etc, and also things as in int *****i
 573              */
 574             *e_token++ = *buf_ptr;
 575             if (++buf_ptr >= buf_end)
 576                 fill_buffer();
 577         }
 578         code = (ps.last_u_d ? unary_op : binary_op);
 579         unary_delim = true;
 580
 581
 582     }                           /* end of switch */
 583     if (code != newline) {
 584         l_struct = false;
 585         last_code = code;
 586     }
 587     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 588         fill_buffer();
 589     ps.last_u_d = unary_delim;
 590     *e_token = '\0';            /* null terminate the token */
 591     return (code);
 592 }
 593
 594 /*
 595  * Add the given keyword to the keyword table, using val as the keyword type
 596  */
 597 void
 598 addkey(char *key, int val)
 599 {
 600     struct templ *p = specials;
 601     while (p->rwd)
 602         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 603             return;
 604         else
 605             p++;
 606     if (p >= specials + sizeof specials / sizeof specials[0])
 607         return;                 /* For now, table overflows are silently
 608                                  * ignored */
 609     p->rwd = key;
 610     p->rwcode = val;
 611     p[1].rwd = 0;
 612     p[1].rwcode = 0;
 613 }