contrib/bc/src/bc_lex.c

   1 /*
   2  * *****************************************************************************
   3  *
   4  * SPDX-License-Identifier: BSD-2-Clause
   5  *
   6  * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions are met:
  10  *
  11  * * Redistributions of source code must retain the above copyright notice, this
  12  *   list of conditions and the following disclaimer.
  13  *
  14  * * Redistributions in binary form must reproduce the above copyright notice,
  15  *   this list of conditions and the following disclaimer in the documentation
  16  *   and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * *****************************************************************************
  31  *
  32  * The lexer for bc.
  33  *
  34  */
  35
  36 #if BC_ENABLED
  37
  38 #include <assert.h>
  39 #include <ctype.h>
  40 #include <string.h>
  41
  42 #include <bc.h>
  43 #include <vm.h>
  44
  45 /**
  46  * Lexes an identifier, which may be a keyword.
  47  * @param l  The lexer.
  48  */
  49 static void
  50 bc_lex_identifier(BcLex* l)
  51 {
  52         // We already passed the first character, so we need to be sure to include
  53         // it.
  54         const char* buf = l->buf + l->i - 1;
  55         size_t i;
  56
  57         // This loop is simply checking for keywords.
  58         for (i = 0; i < bc_lex_kws_len; ++i)
  59         {
  60                 const BcLexKeyword* kw = bc_lex_kws + i;
  61                 size_t n = BC_LEX_KW_LEN(kw);
  62
  63                 if (!strncmp(buf, kw->name, n) && !isalnum(buf[n]) && buf[n] != '_')
  64                 {
  65                         // If the keyword has been redefined, and redefinition is allowed
  66                         // (it is not allowed for builtin libraries), break out of the loop
  67                         // and use it as a name. This depends on the argument parser to
  68                         // ensure that only non-POSIX keywords get redefined.
  69                         if (!vm->no_redefine && vm->redefined_kws[i]) break;
  70
  71                         l->t = BC_LEX_KW_AUTO + (BcLexType) i;
  72
  73                         // Warn or error, as appropriate for the mode, if the keyword is not
  74                         // in the POSIX standard.
  75                         if (!BC_LEX_KW_POSIX(kw)) bc_lex_verr(l, BC_ERR_POSIX_KW, kw->name);
  76
  77                         // We minus 1 because the index has already been incremented.
  78                         l->i += n - 1;
  79
  80                         // Already have the token; bail.
  81                         return;
  82                 }
  83         }
  84
  85         // If not a keyword, parse the name.
  86         bc_lex_name(l);
  87
  88         // POSIX doesn't allow identifiers that are more than one character, so we
  89         // might have to warn or error here too.
  90         if (BC_ERR(l->str.len - 1 > 1))
  91         {
  92                 bc_lex_verr(l, BC_ERR_POSIX_NAME_LEN, l->str.v);
  93         }
  94 }
  95
  96 /**
  97  * Parses a bc string. This is separate from dc strings because dc strings need
  98  * to be balanced.
  99  * @param l  The lexer.
 100  */
 101 static void
 102 bc_lex_string(BcLex* l)
 103 {
 104         // We need to keep track of newlines to increment them properly.
 105         size_t len, nlines, i;
 106         const char* buf;
 107         char c;
 108         bool got_more;
 109
 110         l->t = BC_LEX_STR;
 111
 112         do
 113         {
 114                 nlines = 0;
 115                 buf = l->buf;
 116                 got_more = false;
 117
 118                 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
 119
 120                 // Fortunately for us, bc doesn't escape quotes. Instead, the equivalent
 121                 // is '\q', which makes this loop simpler.
 122                 for (i = l->i; (c = buf[i]) && c != '"'; ++i)
 123                 {
 124                         nlines += (c == '\n');
 125                 }
 126
 127                 if (BC_ERR(c == '\0') && !vm->eof && l->mode != BC_MODE_FILE)
 128                 {
 129                         got_more = bc_lex_readLine(l);
 130                 }
 131         }
 132         while (got_more && c != '"');
 133
 134         // If the string did not end properly, barf.
 135         if (c != '"')
 136         {
 137                 l->i = i;
 138                 bc_lex_err(l, BC_ERR_PARSE_STRING);
 139         }
 140
 141         // Set the temp string to the parsed string.
 142         len = i - l->i;
 143         bc_vec_string(&l->str, len, l->buf + l->i);
 144
 145         l->i = i + 1;
 146         l->line += nlines;
 147 }
 148
 149 /**
 150  * This function takes a lexed operator and checks to see if it's the assignment
 151  * version, setting the token appropriately.
 152  * @param l        The lexer.
 153  * @param with     The token to assign if it is an assignment operator.
 154  * @param without  The token to assign if it is not an assignment operator.
 155  */
 156 static void
 157 bc_lex_assign(BcLex* l, BcLexType with, BcLexType without)
 158 {
 159         if (l->buf[l->i] == '=')
 160         {
 161                 l->i += 1;
 162                 l->t = with;
 163         }
 164         else l->t = without;
 165 }
 166
 167 void
 168 bc_lex_token(BcLex* l)
 169 {
 170         // We increment here. This means that all lexing needs to take that into
 171         // account, such as when parsing an identifier. If we don't, the first
 172         // character of every identifier would be missing.
 173         char c = l->buf[l->i++], c2;
 174
 175         BC_SIG_ASSERT_LOCKED;
 176
 177         // This is the workhorse of the lexer.
 178         switch (c)
 179         {
 180                 case '\0':
 181                 case '\n':
 182                 case '\t':
 183                 case '\v':
 184                 case '\f':
 185                 case '\r':
 186                 case ' ':
 187                 {
 188                         bc_lex_commonTokens(l, c);
 189                         break;
 190                 }
 191
 192                 case '!':
 193                 {
 194                         // Even though it's not an assignment, we can use this.
 195                         bc_lex_assign(l, BC_LEX_OP_REL_NE, BC_LEX_OP_BOOL_NOT);
 196
 197                         // POSIX doesn't allow boolean not.
 198                         if (l->t == BC_LEX_OP_BOOL_NOT)
 199                         {
 200                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "!");
 201                         }
 202
 203                         break;
 204                 }
 205
 206                 case '"':
 207                 {
 208                         bc_lex_string(l);
 209                         break;
 210                 }
 211
 212                 case '#':
 213                 {
 214                         // POSIX does not allow line comments.
 215                         bc_lex_err(l, BC_ERR_POSIX_COMMENT);
 216                         bc_lex_lineComment(l);
 217                         break;
 218                 }
 219
 220                 case '%':
 221                 {
 222                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_MODULUS, BC_LEX_OP_MODULUS);
 223                         break;
 224                 }
 225
 226                 case '&':
 227                 {
 228                         c2 = l->buf[l->i];
 229
 230                         // Either we have boolean and or an error. And boolean and is not
 231                         // allowed by POSIX.
 232                         if (BC_NO_ERR(c2 == '&'))
 233                         {
 234                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "&&");
 235
 236                                 l->i += 1;
 237                                 l->t = BC_LEX_OP_BOOL_AND;
 238                         }
 239                         else bc_lex_invalidChar(l, c);
 240
 241                         break;
 242                 }
 243 #if BC_ENABLE_EXTRA_MATH
 244                 case '$':
 245                 {
 246                         l->t = BC_LEX_OP_TRUNC;
 247                         break;
 248                 }
 249
 250                 case '@':
 251                 {
 252                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLACES, BC_LEX_OP_PLACES);
 253                         break;
 254                 }
 255 #endif // BC_ENABLE_EXTRA_MATH
 256                 case '(':
 257                 case ')':
 258                 {
 259                         l->t = (BcLexType) (c - '(' + BC_LEX_LPAREN);
 260                         break;
 261                 }
 262
 263                 case '*':
 264                 {
 265                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_MULTIPLY, BC_LEX_OP_MULTIPLY);
 266                         break;
 267                 }
 268
 269                 case '+':
 270                 {
 271                         c2 = l->buf[l->i];
 272
 273                         // Have to check for increment first.
 274                         if (c2 == '+')
 275                         {
 276                                 l->i += 1;
 277                                 l->t = BC_LEX_OP_INC;
 278                         }
 279                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_PLUS, BC_LEX_OP_PLUS);
 280                         break;
 281                 }
 282
 283                 case ',':
 284                 {
 285                         l->t = BC_LEX_COMMA;
 286                         break;
 287                 }
 288
 289                 case '-':
 290                 {
 291                         c2 = l->buf[l->i];
 292
 293                         // Have to check for decrement first.
 294                         if (c2 == '-')
 295                         {
 296                                 l->i += 1;
 297                                 l->t = BC_LEX_OP_DEC;
 298                         }
 299                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_MINUS, BC_LEX_OP_MINUS);
 300                         break;
 301                 }
 302
 303                 case '.':
 304                 {
 305                         c2 = l->buf[l->i];
 306
 307                         // If it's alone, it's an alias for last.
 308                         if (BC_LEX_NUM_CHAR(c2, true, false)) bc_lex_number(l, c);
 309                         else
 310                         {
 311                                 l->t = BC_LEX_KW_LAST;
 312                                 bc_lex_err(l, BC_ERR_POSIX_DOT);
 313                         }
 314
 315                         break;
 316                 }
 317
 318                 case '/':
 319                 {
 320                         c2 = l->buf[l->i];
 321                         if (c2 == '*') bc_lex_comment(l);
 322                         else bc_lex_assign(l, BC_LEX_OP_ASSIGN_DIVIDE, BC_LEX_OP_DIVIDE);
 323                         break;
 324                 }
 325
 326                 case '0':
 327                 case '1':
 328                 case '2':
 329                 case '3':
 330                 case '4':
 331                 case '5':
 332                 case '6':
 333                 case '7':
 334                 case '8':
 335                 case '9':
 336                 case 'A':
 337                 case 'B':
 338                 case 'C':
 339                 case 'D':
 340                 case 'E':
 341                 case 'F':
 342                 // Apparently, GNU bc (and maybe others) allows any uppercase letter as
 343                 // a number. When single digits, they act like the ones above. When
 344                 // multi-digit, any letter above the input base is automatically set to
 345                 // the biggest allowable digit in the input base.
 346                 case 'G':
 347                 case 'H':
 348                 case 'I':
 349                 case 'J':
 350                 case 'K':
 351                 case 'L':
 352                 case 'M':
 353                 case 'N':
 354                 case 'O':
 355                 case 'P':
 356                 case 'Q':
 357                 case 'R':
 358                 case 'S':
 359                 case 'T':
 360                 case 'U':
 361                 case 'V':
 362                 case 'W':
 363                 case 'X':
 364                 case 'Y':
 365                 case 'Z':
 366                 {
 367                         bc_lex_number(l, c);
 368                         break;
 369                 }
 370
 371                 case ';':
 372                 {
 373                         l->t = BC_LEX_SCOLON;
 374                         break;
 375                 }
 376
 377                 case '<':
 378                 {
 379 #if BC_ENABLE_EXTRA_MATH
 380                         c2 = l->buf[l->i];
 381
 382                         // Check for shift.
 383                         if (c2 == '<')
 384                         {
 385                                 l->i += 1;
 386                                 bc_lex_assign(l, BC_LEX_OP_ASSIGN_LSHIFT, BC_LEX_OP_LSHIFT);
 387                                 break;
 388                         }
 389 #endif // BC_ENABLE_EXTRA_MATH
 390                         bc_lex_assign(l, BC_LEX_OP_REL_LE, BC_LEX_OP_REL_LT);
 391                         break;
 392                 }
 393
 394                 case '=':
 395                 {
 396                         bc_lex_assign(l, BC_LEX_OP_REL_EQ, BC_LEX_OP_ASSIGN);
 397                         break;
 398                 }
 399
 400                 case '>':
 401                 {
 402 #if BC_ENABLE_EXTRA_MATH
 403                         c2 = l->buf[l->i];
 404
 405                         // Check for shift.
 406                         if (c2 == '>')
 407                         {
 408                                 l->i += 1;
 409                                 bc_lex_assign(l, BC_LEX_OP_ASSIGN_RSHIFT, BC_LEX_OP_RSHIFT);
 410                                 break;
 411                         }
 412 #endif // BC_ENABLE_EXTRA_MATH
 413                         bc_lex_assign(l, BC_LEX_OP_REL_GE, BC_LEX_OP_REL_GT);
 414                         break;
 415                 }
 416
 417                 case '[':
 418                 case ']':
 419                 {
 420                         l->t = (BcLexType) (c - '[' + BC_LEX_LBRACKET);
 421                         break;
 422                 }
 423
 424                 case '\\':
 425                 {
 426                         // In bc, a backslash+newline is whitespace.
 427                         if (BC_NO_ERR(l->buf[l->i] == '\n'))
 428                         {
 429                                 l->i += 1;
 430                                 l->t = BC_LEX_WHITESPACE;
 431                         }
 432                         else bc_lex_invalidChar(l, c);
 433                         break;
 434                 }
 435
 436                 case '^':
 437                 {
 438                         bc_lex_assign(l, BC_LEX_OP_ASSIGN_POWER, BC_LEX_OP_POWER);
 439                         break;
 440                 }
 441
 442                 case 'a':
 443                 case 'b':
 444                 case 'c':
 445                 case 'd':
 446                 case 'e':
 447                 case 'f':
 448                 case 'g':
 449                 case 'h':
 450                 case 'i':
 451                 case 'j':
 452                 case 'k':
 453                 case 'l':
 454                 case 'm':
 455                 case 'n':
 456                 case 'o':
 457                 case 'p':
 458                 case 'q':
 459                 case 'r':
 460                 case 's':
 461                 case 't':
 462                 case 'u':
 463                 case 'v':
 464                 case 'w':
 465                 case 'x':
 466                 case 'y':
 467                 case 'z':
 468                 {
 469                         bc_lex_identifier(l);
 470                         break;
 471                 }
 472
 473                 case '{':
 474                 case '}':
 475                 {
 476                         l->t = (BcLexType) (c - '{' + BC_LEX_LBRACE);
 477                         break;
 478                 }
 479
 480                 case '|':
 481                 {
 482                         c2 = l->buf[l->i];
 483
 484                         // Once again, boolean or is not allowed by POSIX.
 485                         if (BC_NO_ERR(c2 == '|'))
 486                         {
 487                                 bc_lex_verr(l, BC_ERR_POSIX_BOOL, "||");
 488
 489                                 l->i += 1;
 490                                 l->t = BC_LEX_OP_BOOL_OR;
 491                         }
 492                         else bc_lex_invalidChar(l, c);
 493
 494                         break;
 495                 }
 496
 497                 default:
 498                 {
 499                         bc_lex_invalidChar(l, c);
 500                 }
 501         }
 502 }
 503 #endif // BC_ENABLED