contrib/bc/src/bc_lex.c

   1 /*
   2  * *****************************************************************************
   3  *
   4  * SPDX-License-Identifier: BSD-2-Clause
   5  *
   6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions are met:
  10  *
  11  * * Redistributions of source code must retain the above copyright notice, this
  12  *   list of conditions and the following disclaimer.
  13  *
  14  * * Redistributions in binary form must reproduce the above copyright notice,
  15  *   this list of conditions and the following disclaimer in the documentation
  16  *   and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * *****************************************************************************
  31  *
  32  * The lexer for bc.
  33  *
  34  */
  35
  36 #if BC_ENABLED
  37
  38 #include <assert.h>
  39 #include <ctype.h>
  40 #include <string.h>
  41
  42 #include <bc.h>
  43 #include <vm.h>
  44
  45 /**
  46  * Lexes an identifier, which may be a keyword.
  47  * @param l  The lexer.
  48  */
  49 static void bc_lex_identifier(BcLex *l) {
  50
  51         // We already passed the first character, so we need to be sure to include
  52         // it.
  53         const char *buf = l->buf + l->i - 1;
  54         size_t i;
  55
  56         // This loop is simply checking for keywords.
  57         for (i = 0; i < bc_lex_kws_len; ++i) {
  58
  59                 const BcLexKeyword *kw = bc_lex_kws + i;
  60                 size_t n = BC_LEX_KW_LEN(kw);
  61
  62                 if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_') {
  63
  64                         // If the keyword has been redefined, and redefinition is allowed
  65                         // (it is not allowed for builtin libraries), break out of the loop
  66                         // and use it as a name. This depends on the argument parser to
  67                         // ensure that only non-POSIX keywords get redefined.
  68                         if (!vm.no_redefine && vm.redefined_kws[i]) break;
  69
  70                         l->t = BC_LEX_KW_AUTO + (BcLexType) i;
  71
  72                         // Warn or error, as appropriate for the mode, if the keyword is not
  73                         // in the POSIX standard.
  74                         if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
  75
  76                         // We minus 1 because the index has already been incremented.
  77                         l->i += n - 1;
  78
  79                         // Already have the token; bail.
  80                         return;
  81                 }
  82         }
  83
  84         // If not a keyword, parse the name.
  85         bc_lex_name(l);
  86
  87         // POSIX doesn't allow identifiers that are more than one character, so we
  88         // might have to warn or error here too.
  89         if (BC_ERR(l->str.len - 1 > 1))
  90                 bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
  91 }
  92
  93 /**
  94  * Parses a bc string. This is separate from dc strings because dc strings need
  95  * to be balanced.
  96  * @param l  The lexer.
  97  */
  98 static void bc_lex_string(BcLex *l) {
  99
 100         // We need to keep track of newlines to increment them properly.
 101         size_t len, nlines, i;
 102         const char *buf;
 103         char c;
 104         bool got_more;
 105
 106         l->t = BC_LEX_STR;
 107
 108         do {
 109
 110                 nlines = 0;
 111                 buf = l->buf;
 112                 got_more = false;
 113
 114                 assert(!vm.is_stdin || buf == vm.buffer.v);
 115
 116                 // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
 117                 // is '\q', which makes this loop simpler.
 118                 for (i = l->i; (c = buf[i]) && c != '"'; ++i) nlines += (c == '\n');
 119
 120                 if (BC_ERR(c == '\0') && !vm.eof && l->is_stdin)
 121                         got_more = bc_lex_readLine(l);
 122
 123         } while (got_more && c != '"');
 124
 125         // If the string did not end properly, barf.
 126         if (c != '"') {
 127                 l->i = i;
 128                 bc_lex_err(l, BC_ERR_PARSE_STRING);
 129         }
 130
 131         // Set the temp string to the parsed string.
 132         len = i - l->i;
 133         bc_vec_string(&l->str, len, l->buf + l->i);
 134
 135         l->i = i + 1;
 136         l->line += nlines;
 137 }
 138
 139 /**
 140  * This function takes a lexed operator and checks to see if it's the assignment
 141  * version, setting the token appropriately.
 142  * @param l        The lexer.
 143  * @param with     The token to assign if it is an assignment operator.
 144  * @param without  The token to assign if it is not an assignment operator.
 145  */
 146 static void bc_lex_assign(BcLex *l, BcLexType with, BcLexType without) {
 147         if (l->buf[l->i] == '=') {
 148                 l->i += 1;
 149                 l->t = with;
 150         }
 151         else l->t = without;
 152 }
 153
 154 void bc_lex_token(BcLex *l) {
 155
 156         // We increment here. This means that all lexing needs to take that into
 157         // account, such as when parsing an identifier. If we don't, the first
 158         // character of every identifier would be missing.
 159         char c = l->buf[l->i++], c2;
 160
 161         // This is the workhorse of the lexer.
 162         switch (c) {
 163
 164                 case '\0':
 165                 case '\n':
 166                 case '\t':
 167                 case '\v':
 168                 case '\f':
 169                 case '\r':
 170                 case ' ':
 171                 {
 172                         bc_lex_commonTokens(l, c);
 173                         break;
 174                 }
 175
 176                 case '!':
 177                 {
 178                         // Even though it's not an assignment, we can use this.
 179                         bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
 180
 181                         // POSIX doesn't allow boolean not.
 182                         if (l->t == BC_LEX_OP_BOOL_NOT)
 183                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
 184
 185                         break;
 186                 }
 187
 188                 case '"':
 189                 {
 190                         bc_lex_string(l);
 191                         break;
 192                 }
 193
 194                 case '#':
 195                 {
 196                         // POSIX does not allow line comments.
 197                         bc_lex_err(l, BC_ERR_POSIX_COMMENT);
 198                         bc_lex_lineComment(l);
 199                         break;
 200                 }
 201
 202                 case '%':
 203                 {
 204                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
 205                         break;
 206                 }
 207
 208                 case '&':
 209                 {
 210                         c2 = l->buf[l->i];
 211
 212                         // Either we have boolean and or an error. And boolean and is not
 213                         // allowed by POSIX.
 214                         if (BC_NO_ERR(c2 == '&')) {
 215
 216                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
 217
 218                                 l->i += 1;
 219                                 l->t = BC_LEX_OP_BOOL_AND;
 220                         }
 221                         else bc_lex_invalidChar(l, c);
 222
 223                         break;
 224                 }
 225 #if BC_ENABLE_EXTRA_MATH
 226                 case '$':
 227                 {
 228                         l->t = BC_LEX_OP_TRUNC;
 229                         break;
 230                 }
 231
 232                 case '@':
 233                 {
 234                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
 235                         break;
 236                 }
 237 #endif // BC_ENABLE_EXTRA_MATH
 238                 case '(':
 239                 case ')':
 240                 {
 241                         l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
 242                         break;
 243                 }
 244
 245                 case '*':
 246                 {
 247                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
 248                         break;
 249                 }
 250
 251                 case '+':
 252                 {
 253                         c2 = l->buf[l->i];
 254
 255                         // Have to check for increment first.
 256                         if (c2 == '+') {
 257                                 l->i += 1;
 258                                 l->t = BC_LEX_OP_INC;
 259                         }
 260                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
 261                         break;
 262                 }
 263
 264                 case ',':
 265                 {
 266                         l->t = BC_LEX_COMMA;
 267                         break;
 268                 }
 269
 270                 case '-':
 271                 {
 272                         c2 = l->buf[l->i];
 273
 274                         // Have to check for decrement first.
 275                         if (c2 == '-') {
 276                                 l->i += 1;
 277                                 l->t = BC_LEX_OP_DEC;
 278                         }
 279                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
 280                         break;
 281                 }
 282
 283                 case '.':
 284                 {
 285                         c2 = l->buf[l->i];
 286
 287                         // If it's alone, it's an alias for last.
 288                         if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
 289                         else {
 290                                 l->t = BC_LEX_KW_LAST;
 291                                 bc_lex_err(l, BC_ERR_POSIX_DOT);
 292                         }
 293
 294                         break;
 295                 }
 296
 297                 case '/':
 298                 {
 299                         c2 = l->buf[l->i];
 300                         if (c2 =='*') bc_lex_comment(l);
 301                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
 302                         break;
 303                 }
 304
 305                 case '0':
 306                 case '1':
 307                 case '2':
 308                 case '3':
 309                 case '4':
 310                 case '5':
 311                 case '6':
 312                 case '7':
 313                 case '8':
 314                 case '9':
 315                 case 'A':
 316                 case 'B':
 317                 case 'C':
 318                 case 'D':
 319                 case 'E':
 320                 case 'F':
 321                 // Apparently, GNU bc (and maybe others) allows any uppercase letter as
 322                 // a number. When single digits, they act like the ones above. When
 323                 // multi-digit, any letter above the input base is automatically set to
 324                 // the biggest allowable digit in the input base.
 325                 case 'G':
 326                 case 'H':
 327                 case 'I':
 328                 case 'J':
 329                 case 'K':
 330                 case 'L':
 331                 case 'M':
 332                 case 'N':
 333                 case 'O':
 334                 case 'P':
 335                 case 'Q':
 336                 case 'R':
 337                 case 'S':
 338                 case 'T':
 339                 case 'U':
 340                 case 'V':
 341                 case 'W':
 342                 case 'X':
 343                 case 'Y':
 344                 case 'Z':
 345                 {
 346                         bc_lex_number(l, c);
 347                         break;
 348                 }
 349
 350                 case ';':
 351                 {
 352                         l->t = BC_LEX_SCOLON;
 353                         break;
 354                 }
 355
 356                 case '<':
 357                 {
 358 #if BC_ENABLE_EXTRA_MATH
 359                         c2 = l->buf[l->i];
 360
 361                         // Check for shift.
 362                         if (c2 == '<') {
 363                                 l->i += 1;
 364                                 bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
 365                                 break;
 366                         }
 367 #endif // BC_ENABLE_EXTRA_MATH
 368                         bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
 369                         break;
 370                 }
 371
 372                 case '=':
 373                 {
 374                         bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
 375                         break;
 376                 }
 377
 378                 case '>':
 379                 {
 380 #if BC_ENABLE_EXTRA_MATH
 381                         c2 = l->buf[l->i];
 382
 383                         // Check for shift.
 384                         if (c2 == '>') {
 385                                 l->i += 1;
 386                                 bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
 387                                 break;
 388                         }
 389 #endif // BC_ENABLE_EXTRA_MATH
 390                         bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
 391                         break;
 392                 }
 393
 394                 case '[':
 395                 case ']':
 396                 {
 397                         l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
 398                         break;
 399                 }
 400
 401                 case '\\':
 402                 {
 403                         // In bc, a backslash+newline is whitespace.
 404                         if (BC_NO_ERR(l->buf[l->i] == '\n')) {
 405                                 l->i += 1;
 406                                 l->t = BC_LEX_WHITESPACE;
 407                         }
 408                         else bc_lex_invalidChar(l, c);
 409                         break;
 410                 }
 411
 412                 case '^':
 413                 {
 414                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
 415                         break;
 416                 }
 417
 418                 case 'a':
 419                 case 'b':
 420                 case 'c':
 421                 case 'd':
 422                 case 'e':
 423                 case 'f':
 424                 case 'g':
 425                 case 'h':
 426                 case 'i':
 427                 case 'j':
 428                 case 'k':
 429                 case 'l':
 430                 case 'm':
 431                 case 'n':
 432                 case 'o':
 433                 case 'p':
 434                 case 'q':
 435                 case 'r':
 436                 case 's':
 437                 case 't':
 438                 case 'u':
 439                 case 'v':
 440                 case 'w':
 441                 case 'x':
 442                 case 'y':
 443                 case 'z':
 444                 {
 445                         bc_lex_identifier(l);
 446                         break;
 447                 }
 448
 449                 case '{':
 450                 case '}':
 451                 {
 452                         l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
 453                         break;
 454                 }
 455
 456                 case '|':
 457                 {
 458                         c2 = l->buf[l->i];
 459
 460                         // Once again, boolean or is not allowed by POSIX.
 461                         if (BC_NO_ERR(c2 == '|')) {
 462
 463                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
 464
 465                                 l->i += 1;
 466                                 l->t = BC_LEX_OP_BOOL_OR;
 467                         }
 468                         else bc_lex_invalidChar(l, c);
 469
 470                         break;
 471                 }
 472
 473                 default:
 474                 {
 475                         bc_lex_invalidChar(l, c);
 476                 }
 477         }
 478 }
 479 #endif // BC_ENABLED