contrib/bc/src/lex.c

   1 /*
   2  * *****************************************************************************
   3  *
   4  * SPDX-License-Identifier: BSD-2-Clause
   5  *
   6  * Copyright (c) 2018-2023 Gavin D. Howard and contributors.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions are met:
  10  *
  11  * * Redistributions of source code must retain the above copyright notice, this
  12  *   list of conditions and the following disclaimer.
  13  *
  14  * * Redistributions in binary form must reproduce the above copyright notice,
  15  *   this list of conditions and the following disclaimer in the documentation
  16  *   and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * *****************************************************************************
  31  *
  32  * Common code for the lexers.
  33  *
  34  */
  35
  36 #include <assert.h>
  37 #include <ctype.h>
  38 #include <stdbool.h>
  39 #include <string.h>
  40
  41 #include <lex.h>
  42 #include <vm.h>
  43 #include <bc.h>
  44
  45 void
  46 bc_lex_invalidChar(BcLex* l, char c)
  47 {
  48         l->t = BC_LEX_INVALID;
  49         bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
  50 }
  51
  52 void
  53 bc_lex_lineComment(BcLex* l)
  54 {
  55         l->t = BC_LEX_WHITESPACE;
  56         while (l->i < l->len && l->buf[l->i] != '\n')
  57         {
  58                 l->i += 1;
  59         }
  60 }
  61
  62 void
  63 bc_lex_comment(BcLex* l)
  64 {
  65         size_t i, nlines = 0;
  66         const char* buf;
  67         bool end = false, got_more;
  68         char c;
  69
  70         l->i += 1;
  71         l->t = BC_LEX_WHITESPACE;
  72
  73         // This loop is complex because it might need to request more data from
  74         // stdin if the comment is not ended. This loop is taken until the comment
  75         // is finished or we have EOF.
  76         do
  77         {
  78                 buf = l->buf;
  79                 got_more = false;
  80
  81                 // If we are in stdin mode, the buffer must be the one used for stdin.
  82                 assert(vm->mode != BC_MODE_STDIN || buf == vm->buffer.v);
  83
  84                 // Find the end of the comment.
  85                 for (i = l->i; !end; i += !end)
  86                 {
  87                         // While we don't have an asterisk, eat, but increment nlines.
  88                         for (; (c = buf[i]) && c != '*'; ++i)
  89                         {
  90                                 nlines += (c == '\n');
  91                         }
  92
  93                         // If this is true, we need to request more data.
  94                         if (BC_ERR(!c || buf[i + 1] == '\0'))
  95                         {
  96                                 // Read more, if possible.
  97                                 if (!vm->eof && l->mode != BC_MODE_FILE)
  98                                 {
  99                                         got_more = bc_lex_readLine(l);
 100                                 }
 101
 102                                 break;
 103                         }
 104
 105                         // If this turns true, we found the end. Yay!
 106                         end = (buf[i + 1] == '/');
 107                 }
 108         }
 109         while (got_more && !end);
 110
 111         // If we didn't find the end, barf.
 112         if (!end)
 113         {
 114                 l->i = i;
 115                 bc_lex_err(l, BC_ERR_PARSE_COMMENT);
 116         }
 117
 118         l->i = i + 2;
 119         l->line += nlines;
 120 }
 121
 122 void
 123 bc_lex_whitespace(BcLex* l)
 124 {
 125         char c;
 126
 127         l->t = BC_LEX_WHITESPACE;
 128
 129         // Eat. We don't eat newlines because they can be special.
 130         for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i])
 131         {
 132                 continue;
 133         }
 134 }
 135
 136 void
 137 bc_lex_commonTokens(BcLex* l, char c)
 138 {
 139         if (!c) l->t = BC_LEX_EOF;
 140         else if (c == '\n') l->t = BC_LEX_NLINE;
 141         else bc_lex_whitespace(l);
 142 }
 143
 144 /**
 145  * Parses a number.
 146  * @param l         The lexer.
 147  * @param start     The start character.
 148  * @param int_only  Whether this function should only look for an integer. This
 149  *                  is used to implement the exponent of scientific notation.
 150  */
 151 static size_t
 152 bc_lex_num(BcLex* l, char start, bool int_only)
 153 {
 154         const char* buf = l->buf + l->i;
 155         size_t i;
 156         char c;
 157         bool last_pt, pt = (start == '.');
 158
 159         // This loop looks complex. It is not. It is asking if the character is not
 160         // a nul byte and it if it a valid num character based on what we have found
 161         // thus far, or whether it is a backslash followed by a newline. I can do
 162         // i+1 on the buffer because the buffer must have a nul byte.
 163         for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
 164                                      (c == '\\' && buf[i + 1] == '\n'));
 165              ++i)
 166         {
 167                 // I don't need to test that the next character is a newline because
 168                 // the loop condition above ensures that.
 169                 if (c == '\\')
 170                 {
 171                         i += 2;
 172
 173                         // Make sure to eat whitespace at the beginning of the line.
 174                         while (isspace(buf[i]) && buf[i] != '\n')
 175                         {
 176                                 i += 1;
 177                         }
 178
 179                         c = buf[i];
 180
 181                         // If the next character is not a number character, bail.
 182                         if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
 183                 }
 184
 185                 // Did we find the radix point?
 186                 last_pt = (c == '.');
 187
 188                 // If we did, and we already have one, then break because it's not part
 189                 // of this number.
 190                 if (pt && last_pt) break;
 191
 192                 // Set whether we have found a radix point.
 193                 pt = pt || last_pt;
 194
 195                 bc_vec_push(&l->str, &c);
 196         }
 197
 198         return i;
 199 }
 200
 201 void
 202 bc_lex_number(BcLex* l, char start)
 203 {
 204         l->t = BC_LEX_NUMBER;
 205
 206         // Make sure the string is clear.
 207         bc_vec_popAll(&l->str);
 208         bc_vec_push(&l->str, &start);
 209
 210         // Parse the number.
 211         l->i += bc_lex_num(l, start, false);
 212
 213 #if BC_ENABLE_EXTRA_MATH
 214         {
 215                 char c = l->buf[l->i];
 216
 217                 // Do we have a number in scientific notation?
 218                 if (c == 'e')
 219                 {
 220 #if BC_ENABLED
 221                         // Barf for POSIX.
 222                         if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
 223 #endif // BC_ENABLED
 224
 225                         // Push the e.
 226                         bc_vec_push(&l->str, &c);
 227                         l->i += 1;
 228                         c = l->buf[l->i];
 229
 230                         // Check for negative specifically because bc_lex_num() does not.
 231                         if (c == BC_LEX_NEG_CHAR)
 232                         {
 233                                 bc_vec_push(&l->str, &c);
 234                                 l->i += 1;
 235                                 c = l->buf[l->i];
 236                         }
 237
 238                         // We must have a number character, so barf if not.
 239                         if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
 240                         {
 241                                 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
 242                         }
 243
 244                         // Parse the exponent.
 245                         l->i += bc_lex_num(l, 0, true);
 246                 }
 247         }
 248 #endif // BC_ENABLE_EXTRA_MATH
 249
 250         bc_vec_pushByte(&l->str, '\0');
 251 }
 252
 253 void
 254 bc_lex_name(BcLex* l)
 255 {
 256         size_t i = 0;
 257         const char* buf = l->buf + l->i - 1;
 258         char c = buf[i];
 259
 260         l->t = BC_LEX_NAME;
 261
 262         // Should be obvious. It's looking for valid characters.
 263         while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_')
 264         {
 265                 c = buf[++i];
 266         }
 267
 268         // Set the string to the identifier.
 269         bc_vec_string(&l->str, i, buf);
 270
 271         // Increment the index. We minus 1 because it has already been incremented.
 272         l->i += i - 1;
 273 }
 274
 275 void
 276 bc_lex_init(BcLex* l)
 277 {
 278         BC_SIG_ASSERT_LOCKED;
 279         assert(l != NULL);
 280         bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
 281 }
 282
 283 void
 284 bc_lex_free(BcLex* l)
 285 {
 286         BC_SIG_ASSERT_LOCKED;
 287         assert(l != NULL);
 288         bc_vec_free(&l->str);
 289 }
 290
 291 void
 292 bc_lex_file(BcLex* l, const char* file)
 293 {
 294         assert(l != NULL && file != NULL);
 295         l->line = 1;
 296         vm->file = file;
 297 }
 298
 299 void
 300 bc_lex_next(BcLex* l)
 301 {
 302         BC_SIG_ASSERT_LOCKED;
 303
 304         assert(l != NULL);
 305
 306         l->last = l->t;
 307
 308         // If this wasn't here, the line number would be off.
 309         l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
 310
 311         // If the last token was EOF, someone called this one too many times.
 312         if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
 313
 314         l->t = BC_LEX_EOF;
 315
 316         // We are done if this is true.
 317         if (l->i == l->len) return;
 318
 319         // Loop until failure or we don't have whitespace. This
 320         // is so the parser doesn't get inundated with whitespace.
 321         do
 322         {
 323                 vm->next(l);
 324         }
 325         while (l->t == BC_LEX_WHITESPACE);
 326 }
 327
 328 /**
 329  * Updates the buffer and len so that they are not invalidated when the stdin
 330  * buffer grows.
 331  * @param l     The lexer.
 332  * @param text  The text.
 333  * @param len   The length of the text.
 334  */
 335 static void
 336 bc_lex_fixText(BcLex* l, const char* text, size_t len)
 337 {
 338         l->buf = text;
 339         l->len = len;
 340 }
 341
 342 bool
 343 bc_lex_readLine(BcLex* l)
 344 {
 345         bool good;
 346
 347         // These are reversed because they should be already locked, but
 348         // bc_vm_readLine() needs them to be unlocked.
 349         BC_SIG_UNLOCK;
 350
 351         // Make sure we read from the appropriate place.
 352         switch (l->mode)
 353         {
 354                 case BC_MODE_EXPRS:
 355                 {
 356                         good = bc_vm_readBuf(false);
 357                         break;
 358                 }
 359
 360                 case BC_MODE_FILE:
 361                 {
 362                         good = false;
 363                         break;
 364                 }
 365
 366                 case BC_MODE_STDIN:
 367                 {
 368                         good = bc_vm_readLine(false);
 369                         break;
 370                 }
 371
 372 #ifdef __GNUC__
 373 #ifndef __clang__
 374                 default:
 375                 {
 376                         // We should never get here.
 377                         abort();
 378                 }
 379 #endif // __clang__
 380 #endif // __GNUC__
 381         }
 382
 383         BC_SIG_LOCK;
 384
 385         bc_lex_fixText(l, vm->buffer.v, vm->buffer.len - 1);
 386
 387         return good;
 388 }
 389
 390 void
 391 bc_lex_text(BcLex* l, const char* text, BcMode mode)
 392 {
 393         BC_SIG_ASSERT_LOCKED;
 394
 395         assert(l != NULL && text != NULL);
 396
 397         bc_lex_fixText(l, text, strlen(text));
 398         l->i = 0;
 399         l->t = l->last = BC_LEX_INVALID;
 400         l->mode = mode;
 401
 402         bc_lex_next(l);
 403 }