usr.bin/indent/lexi.c

   1 /*
   2  * Copyright (c) 1985 Sun Microsystems, Inc.
   3  * Copyright (c) 1980, 1993
   4  *      The Regents of the University of California.  All rights reserved.
   5  * All rights reserved.
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  * 1. Redistributions of source code must retain the above copyright
  11  *    notice, this list of conditions and the following disclaimer.
  12  * 2. Redistributions in binary form must reproduce the above copyright
  13  *    notice, this list of conditions and the following disclaimer in the
  14  *    documentation and/or other materials provided with the distribution.
  15  * 3. All advertising materials mentioning features or use of this software
  16  *    must display the following acknowledgement:
  17  *      This product includes software developed by the University of
  18  *      California, Berkeley and its contributors.
  19  * 4. Neither the name of the University nor the names of its contributors
  20  *    may be used to endorse or promote products derived from this software
  21  *    without specific prior written permission.
  22  *
  23  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  33  * SUCH DAMAGE.
  34  */
  35
  36 #if 0
  37 #ifndef lint
  38 static char sccsid[] = "@(#)lexi.c      8.1 (Berkeley) 6/6/93";
  39 #endif /* not lint */
  40 #endif
  41 #include <sys/cdefs.h>
  42 __FBSDID("$FreeBSD$");
  43
  44 /*
  45  * Here we have the token scanner for indent.  It scans off one token and puts
  46  * it in the global variable "token".  It returns a code, indicating the type
  47  * of token scanned.
  48  */
  49
  50 #include <err.h>
  51 #include <stdio.h>
  52 #include <ctype.h>
  53 #include <stdlib.h>
  54 #include <string.h>
  55 #include "indent_globs.h"
  56 #include "indent_codes.h"
  57 #include "indent.h"
  58
  59 #define alphanum 1
  60 #define opchar 3
  61
  62 struct templ {
  63     const char *rwd;
  64     int         rwcode;
  65 };
  66
  67 struct templ specials[1000] =
  68 {
  69     {"switch", 1},
  70     {"case", 2},
  71     {"break", 0},
  72     {"struct", 3},
  73     {"union", 3},
  74     {"enum", 3},
  75     {"default", 2},
  76     {"int", 4},
  77     {"char", 4},
  78     {"float", 4},
  79     {"double", 4},
  80     {"long", 4},
  81     {"short", 4},
  82     {"typdef", 4},
  83     {"unsigned", 4},
  84     {"register", 4},
  85     {"static", 4},
  86     {"global", 4},
  87     {"extern", 4},
  88     {"void", 4},
  89     {"const", 4},
  90     {"volatile", 4},
  91     {"goto", 0},
  92     {"return", 0},
  93     {"if", 5},
  94     {"while", 5},
  95     {"for", 5},
  96     {"else", 6},
  97     {"do", 6},
  98     {"sizeof", 7},
  99     {0, 0}
 100 };
 101
 102 char        chartype[128] =
 103 {                               /* this is used to facilitate the decision of
 104                                  * what type (alphanumeric, operator) each
 105                                  * character is */
 106     0, 0, 0, 0, 0, 0, 0, 0,
 107     0, 0, 0, 0, 0, 0, 0, 0,
 108     0, 0, 0, 0, 0, 0, 0, 0,
 109     0, 0, 0, 0, 0, 0, 0, 0,
 110     0, 3, 0, 0, 1, 3, 3, 0,
 111     0, 0, 3, 3, 0, 3, 0, 3,
 112     1, 1, 1, 1, 1, 1, 1, 1,
 113     1, 1, 0, 0, 3, 3, 3, 3,
 114     0, 1, 1, 1, 1, 1, 1, 1,
 115     1, 1, 1, 1, 1, 1, 1, 1,
 116     1, 1, 1, 1, 1, 1, 1, 1,
 117     1, 1, 1, 0, 0, 0, 3, 1,
 118     0, 1, 1, 1, 1, 1, 1, 1,
 119     1, 1, 1, 1, 1, 1, 1, 1,
 120     1, 1, 1, 1, 1, 1, 1, 1,
 121     1, 1, 1, 0, 3, 0, 3, 0
 122 };
 123
 124 int
 125 lexi(void)
 126 {
 127     int         unary_delim;    /* this is set to 1 if the current token
 128                                  * forces a following operator to be unary */
 129     static int  last_code;      /* the last token type returned */
 130     static int  l_struct;       /* set to 1 if the last token was 'struct' */
 131     int         code;           /* internal code to be returned */
 132     char        qchar;          /* the delimiter character for a string */
 133
 134     e_token = s_token;          /* point to start of place to save token */
 135     unary_delim = false;
 136     ps.col_1 = ps.last_nl;      /* tell world that this token started in
 137                                  * column 1 iff the last thing scanned was nl */
 138     ps.last_nl = false;
 139
 140     while (*buf_ptr == ' ' || *buf_ptr == '\t') {       /* get rid of blanks */
 141         ps.col_1 = false;       /* leading blanks imply token is not in column
 142                                  * 1 */
 143         if (++buf_ptr >= buf_end)
 144             fill_buffer();
 145     }
 146
 147     /* Scan an alphanumeric token */
 148     if (chartype[(int)*buf_ptr] == alphanum || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 149         /*
 150          * we have a character or number
 151          */
 152         const char *j;          /* used for searching thru list of
 153                                  *
 154                                  * reserved words */
 155         struct templ *p;
 156
 157         if (isdigit(*buf_ptr) || (buf_ptr[0] == '.' && isdigit(buf_ptr[1]))) {
 158             int         seendot = 0,
 159                         seenexp = 0,
 160                         seensfx = 0;
 161             if (*buf_ptr == '0' &&
 162                     (buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
 163                 *e_token++ = *buf_ptr++;
 164                 *e_token++ = *buf_ptr++;
 165                 while (isxdigit(*buf_ptr)) {
 166                     CHECK_SIZE_TOKEN;
 167                     *e_token++ = *buf_ptr++;
 168                 }
 169             }
 170             else
 171                 while (1) {
 172                     if (*buf_ptr == '.') {
 173                         if (seendot)
 174                             break;
 175                         else
 176                             seendot++;
 177                     }
 178                     CHECK_SIZE_TOKEN;
 179                     *e_token++ = *buf_ptr++;
 180                     if (!isdigit(*buf_ptr) && *buf_ptr != '.') {
 181                         if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
 182                             break;
 183                         else {
 184                             seenexp++;
 185                             seendot++;
 186                             CHECK_SIZE_TOKEN;
 187                             *e_token++ = *buf_ptr++;
 188                             if (*buf_ptr == '+' || *buf_ptr == '-')
 189                                 *e_token++ = *buf_ptr++;
 190                         }
 191                     }
 192                 }
 193             while (1) {
 194                 if (!(seensfx & 1) &&
 195                         (*buf_ptr == 'U' || *buf_ptr == 'u')) {
 196                     CHECK_SIZE_TOKEN;
 197                     *e_token++ = *buf_ptr++;
 198                     seensfx |= 1;
 199                     continue;
 200                 }
 201                 if (!(seensfx & 2) &&
 202                         (*buf_ptr == 'L' || *buf_ptr == 'l')) {
 203                     CHECK_SIZE_TOKEN;
 204                     if (buf_ptr[1] == buf_ptr[0])
 205                         *e_token++ = *buf_ptr++;
 206                     *e_token++ = *buf_ptr++;
 207                     seensfx |= 2;
 208                     continue;
 209                 }
 210                 break;
 211             }
 212         }
 213         else
 214             while (chartype[(int)*buf_ptr] == alphanum || *buf_ptr == BACKSLASH) {
 215                 /* fill_buffer() terminates buffer with newline */
 216                 if (*buf_ptr == BACKSLASH) {
 217                     if (*(buf_ptr + 1) == '\n') {
 218                         buf_ptr += 2;
 219                         if (buf_ptr >= buf_end)
 220                             fill_buffer();
 221                         } else
 222                             break;
 223                 }
 224                 CHECK_SIZE_TOKEN;
 225                 /* copy it over */
 226                 *e_token++ = *buf_ptr++;
 227                 if (buf_ptr >= buf_end)
 228                     fill_buffer();
 229             }
 230         *e_token++ = '\0';
 231         while (*buf_ptr == ' ' || *buf_ptr == '\t') {   /* get rid of blanks */
 232             if (++buf_ptr >= buf_end)
 233                 fill_buffer();
 234         }
 235         ps.its_a_keyword = false;
 236         ps.sizeof_keyword = false;
 237         if (l_struct && !ps.p_l_follow) {
 238                                 /* if last token was 'struct' and we're not
 239                                  * in parentheses, then this token
 240                                  * should be treated as a declaration */
 241             l_struct = false;
 242             last_code = ident;
 243             ps.last_u_d = true;
 244             return (decl);
 245         }
 246         ps.last_u_d = l_struct; /* Operator after identifier is binary
 247                                  * unless last token was 'struct' */
 248         l_struct = false;
 249         last_code = ident;      /* Remember that this is the code we will
 250                                  * return */
 251
 252         if (auto_typedefs) {
 253             const char *q = s_token;
 254             size_t q_len = strlen(q);
 255             /* Check if we have an "_t" in the end */
 256             if (q_len > 2 &&
 257                 (strcmp(q + q_len - 2, "_t") == 0)) {
 258                 ps.its_a_keyword = true;
 259                 ps.last_u_d = true;
 260                 goto found_auto_typedef;
 261             }
 262         }
 263
 264         /*
 265          * This loop will check if the token is a keyword.
 266          */
 267         for (p = specials; (j = p->rwd) != 0; p++) {
 268             const char *q = s_token;    /* point at scanned token */
 269             if (*j++ != *q++ || *j++ != *q++)
 270                 continue;       /* This test depends on the fact that
 271                                  * identifiers are always at least 1 character
 272                                  * long (ie. the first two bytes of the
 273                                  * identifier are always meaningful) */
 274             if (q[-1] == 0)
 275                 break;          /* If its a one-character identifier */
 276             while (*q++ == *j)
 277                 if (*j++ == 0)
 278                     goto found_keyword; /* I wish that C had a multi-level
 279                                          * break... */
 280         }
 281         if (p->rwd) {           /* we have a keyword */
 282     found_keyword:
 283             ps.its_a_keyword = true;
 284             ps.last_u_d = true;
 285             switch (p->rwcode) {
 286             case 1:             /* it is a switch */
 287                 return (swstmt);
 288             case 2:             /* a case or default */
 289                 return (casestmt);
 290
 291             case 3:             /* a "struct" */
 292                 /*
 293                  * Next time around, we will want to know that we have had a
 294                  * 'struct'
 295                  */
 296                 l_struct = true;
 297                 /* FALLTHROUGH */
 298
 299             case 4:             /* one of the declaration keywords */
 300             found_auto_typedef:
 301                 if (ps.p_l_follow) {
 302                     ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.sizeof_mask;
 303                     break;      /* inside parens: cast, param list or sizeof */
 304                 }
 305                 last_code = decl;
 306                 return (decl);
 307
 308             case 5:             /* if, while, for */
 309                 return (sp_paren);
 310
 311             case 6:             /* do, else */
 312                 return (sp_nparen);
 313
 314             case 7:
 315                 ps.sizeof_keyword = true;
 316             default:            /* all others are treated like any other
 317                                  * identifier */
 318                 return (ident);
 319             }                   /* end of switch */
 320         }                       /* end of if (found_it) */
 321         if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
 322             char *tp = buf_ptr;
 323             while (tp < buf_end)
 324                 if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 325                     goto not_proc;
 326             strncpy(ps.procname, token, sizeof ps.procname - 1);
 327             ps.in_parameter_declaration = 1;
 328             rparen_count = 1;
 329     not_proc:;
 330         }
 331         /*
 332          * The following hack attempts to guess whether or not the current
 333          * token is in fact a declaration keyword -- one that has been
 334          * typedefd
 335          */
 336         if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
 337                 && !ps.p_l_follow
 338                 && !ps.block_init
 339                 && (ps.last_token == rparen || ps.last_token == semicolon ||
 340                     ps.last_token == decl ||
 341                     ps.last_token == lbrace || ps.last_token == rbrace)) {
 342             ps.its_a_keyword = true;
 343             ps.last_u_d = true;
 344             last_code = decl;
 345             return decl;
 346         }
 347         if (last_code == decl)  /* if this is a declared variable, then
 348                                  * following sign is unary */
 349             ps.last_u_d = true; /* will make "int a -1" work */
 350         last_code = ident;
 351         return (ident);         /* the ident is not in the list */
 352     }                           /* end of procesing for alpanum character */
 353
 354     /* Scan a non-alphanumeric token */
 355
 356     *e_token++ = *buf_ptr;              /* if it is only a one-character token, it is
 357                                  * moved here */
 358     *e_token = '\0';
 359     if (++buf_ptr >= buf_end)
 360         fill_buffer();
 361
 362     switch (*token) {
 363     case '\n':
 364         unary_delim = ps.last_u_d;
 365         ps.last_nl = true;      /* remember that we just had a newline */
 366         code = (had_eof ? 0 : newline);
 367
 368         /*
 369          * if data has been exhausted, the newline is a dummy, and we should
 370          * return code to stop
 371          */
 372         break;
 373
 374     case '\'':                  /* start of quoted character */
 375     case '"':                   /* start of string */
 376         qchar = *token;
 377         if (troff) {
 378             e_token[-1] = '`';
 379             if (qchar == '"')
 380                 *e_token++ = '`';
 381             e_token = chfont(&bodyf, &stringf, e_token);
 382         }
 383         do {                    /* copy the string */
 384             while (1) {         /* move one character or [/<char>]<char> */
 385                 if (*buf_ptr == '\n') {
 386                     diag2(1, "Unterminated literal");
 387                     goto stop_lit;
 388                 }
 389                 CHECK_SIZE_TOKEN;       /* Only have to do this once in this loop,
 390                                          * since CHECK_SIZE guarantees that there
 391                                          * are at least 5 entries left */
 392                 *e_token = *buf_ptr++;
 393                 if (buf_ptr >= buf_end)
 394                     fill_buffer();
 395                 if (*e_token == BACKSLASH) {    /* if escape, copy extra char */
 396                     if (*buf_ptr == '\n')       /* check for escaped newline */
 397                         ++line_no;
 398                     if (troff) {
 399                         *++e_token = BACKSLASH;
 400                         if (*buf_ptr == BACKSLASH)
 401                             *++e_token = BACKSLASH;
 402                     }
 403                     *++e_token = *buf_ptr++;
 404                     ++e_token;  /* we must increment this again because we
 405                                  * copied two chars */
 406                     if (buf_ptr >= buf_end)
 407                         fill_buffer();
 408                 }
 409                 else
 410                     break;      /* we copied one character */
 411             }                   /* end of while (1) */
 412         } while (*e_token++ != qchar);
 413         if (troff) {
 414             e_token = chfont(&stringf, &bodyf, e_token - 1);
 415             if (qchar == '"')
 416                 *e_token++ = '\'';
 417         }
 418 stop_lit:
 419         code = ident;
 420         break;
 421
 422     case ('('):
 423     case ('['):
 424         unary_delim = true;
 425         code = lparen;
 426         break;
 427
 428     case (')'):
 429     case (']'):
 430         code = rparen;
 431         break;
 432
 433     case '#':
 434         unary_delim = ps.last_u_d;
 435         code = preesc;
 436         break;
 437
 438     case '?':
 439         unary_delim = true;
 440         code = question;
 441         break;
 442
 443     case (':'):
 444         code = colon;
 445         unary_delim = true;
 446         break;
 447
 448     case (';'):
 449         unary_delim = true;
 450         code = semicolon;
 451         break;
 452
 453     case ('{'):
 454         unary_delim = true;
 455
 456         /*
 457          * if (ps.in_or_st) ps.block_init = 1;
 458          */
 459         /* ?    code = ps.block_init ? lparen : lbrace; */
 460         code = lbrace;
 461         break;
 462
 463     case ('}'):
 464         unary_delim = true;
 465         /* ?    code = ps.block_init ? rparen : rbrace; */
 466         code = rbrace;
 467         break;
 468
 469     case 014:                   /* a form feed */
 470         unary_delim = ps.last_u_d;
 471         ps.last_nl = true;      /* remember this so we can set 'ps.col_1'
 472                                  * right */
 473         code = form_feed;
 474         break;
 475
 476     case (','):
 477         unary_delim = true;
 478         code = comma;
 479         break;
 480
 481     case '.':
 482         unary_delim = false;
 483         code = period;
 484         break;
 485
 486     case '-':
 487     case '+':                   /* check for -, +, --, ++ */
 488         code = (ps.last_u_d ? unary_op : binary_op);
 489         unary_delim = true;
 490
 491         if (*buf_ptr == token[0]) {
 492             /* check for doubled character */
 493             *e_token++ = *buf_ptr++;
 494             /* buffer overflow will be checked at end of loop */
 495             if (last_code == ident || last_code == rparen) {
 496                 code = (ps.last_u_d ? unary_op : postop);
 497                 /* check for following ++ or -- */
 498                 unary_delim = false;
 499             }
 500         }
 501         else if (*buf_ptr == '=')
 502             /* check for operator += */
 503             *e_token++ = *buf_ptr++;
 504         else if (*buf_ptr == '>') {
 505             /* check for operator -> */
 506             *e_token++ = *buf_ptr++;
 507             if (!pointer_as_binop) {
 508                 unary_delim = false;
 509                 code = unary_op;
 510                 ps.want_blank = false;
 511             }
 512         }
 513         break;                  /* buffer overflow will be checked at end of
 514                                  * switch */
 515
 516     case '=':
 517         if (ps.in_or_st)
 518             ps.block_init = 1;
 519 #ifdef undef
 520         if (chartype[*buf_ptr] == opchar) {     /* we have two char assignment */
 521             e_token[-1] = *buf_ptr++;
 522             if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
 523                 *e_token++ = *buf_ptr++;
 524             *e_token++ = '=';   /* Flip =+ to += */
 525             *e_token = 0;
 526         }
 527 #else
 528         if (*buf_ptr == '=') {/* == */
 529             *e_token++ = '=';   /* Flip =+ to += */
 530             buf_ptr++;
 531             *e_token = 0;
 532         }
 533 #endif
 534         code = binary_op;
 535         unary_delim = true;
 536         break;
 537         /* can drop thru!!! */
 538
 539     case '>':
 540     case '<':
 541     case '!':                   /* ops like <, <<, <=, !=, etc */
 542         if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
 543             *e_token++ = *buf_ptr;
 544             if (++buf_ptr >= buf_end)
 545                 fill_buffer();
 546         }
 547         if (*buf_ptr == '=')
 548             *e_token++ = *buf_ptr++;
 549         code = (ps.last_u_d ? unary_op : binary_op);
 550         unary_delim = true;
 551         break;
 552
 553     default:
 554         if (token[0] == '/' && *buf_ptr == '*') {
 555             /* it is start of comment */
 556             *e_token++ = '*';
 557
 558             if (++buf_ptr >= buf_end)
 559                 fill_buffer();
 560
 561             code = comment;
 562             unary_delim = ps.last_u_d;
 563             break;
 564         }
 565         while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
 566             /*
 567              * handle ||, &&, etc, and also things as in int *****i
 568              */
 569             *e_token++ = *buf_ptr;
 570             if (++buf_ptr >= buf_end)
 571                 fill_buffer();
 572         }
 573         code = (ps.last_u_d ? unary_op : binary_op);
 574         unary_delim = true;
 575
 576
 577     }                           /* end of switch */
 578     if (code != newline) {
 579         l_struct = false;
 580         last_code = code;
 581     }
 582     if (buf_ptr >= buf_end)     /* check for input buffer empty */
 583         fill_buffer();
 584     ps.last_u_d = unary_delim;
 585     *e_token = '\0';            /* null terminate the token */
 586     return (code);
 587 }
 588
 589 /*
 590  * Add the given keyword to the keyword table, using val as the keyword type
 591  */
 592 void
 593 addkey(char *key, int val)
 594 {
 595     struct templ *p = specials;
 596     while (p->rwd)
 597         if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
 598             return;
 599         else
 600             p++;
 601     if (p >= specials + sizeof specials / sizeof specials[0])
 602         return;                 /* For now, table overflows are silently
 603                                  * ignored */
 604     p->rwd = key;
 605     p->rwcode = val;
 606     p[1].rwd = 0;
 607     p[1].rwcode = 0;
 608 }