contrib/bc/src/lex.c

   1 /*
   2  * *****************************************************************************
   3  *
   4  * SPDX-License-Identifier: BSD-2-Clause
   5  *
   6  * Copyright (c) 2018-2021 Gavin D. Howard and contributors.
   7  *
   8  * Redistribution and use in source and binary forms, with or without
   9  * modification, are permitted provided that the following conditions are met:
  10  *
  11  * * Redistributions of source code must retain the above copyright notice, this
  12  *   list of conditions and the following disclaimer.
  13  *
  14  * * Redistributions in binary form must reproduce the above copyright notice,
  15  *   this list of conditions and the following disclaimer in the documentation
  16  *   and/or other materials provided with the distribution.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  *
  30  * *****************************************************************************
  31  *
  32  * Common code for the lexers.
  33  *
  34  */
  35
  36 #include <assert.h>
  37 #include <ctype.h>
  38 #include <stdbool.h>
  39 #include <string.h>
  40
  41 #include <lex.h>
  42 #include <vm.h>
  43 #include <bc.h>
  44
  45 void bc_lex_invalidChar(BcLex *l, char c) {
  46         l->t = BC_LEX_INVALID;
  47         bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
  48 }
  49
  50 void bc_lex_lineComment(BcLex *l) {
  51         l->t = BC_LEX_WHITESPACE;
  52         while (l->i < l->len && l->buf[l->i] != '\n') l->i += 1;
  53 }
  54
  55 void bc_lex_comment(BcLex *l) {
  56
  57         size_t i, nlines = 0;
  58         const char *buf;
  59         bool end = false, got_more;
  60         char c;
  61
  62         l->i += 1;
  63         l->t = BC_LEX_WHITESPACE;
  64
  65         // This loop is complex because it might need to request more data from
  66         // stdin if the comment is not ended. This loop is taken until the comment
  67         // is finished or we have EOF.
  68         do {
  69
  70                 buf = l->buf;
  71                 got_more = false;
  72
  73                 // If we are in stdin mode, the buffer must be the one used for stdin.
  74                 assert(!vm.is_stdin || buf == vm.buffer.v);
  75
  76                 // Find the end of the comment.
  77                 for (i = l->i; !end; i += !end) {
  78
  79                         // While we don't have an asterisk, eat, but increment nlines.
  80                         for (; (c = buf[i]) && c != '*'; ++i) nlines += (c == '\n');
  81
  82                         // If this is true, we need to request more data.
  83                         if (BC_ERR(!c || buf[i + 1] == '\0')) {
  84
  85                                 // Read more.
  86                                 if (!vm.eof && l->is_stdin) got_more = bc_lex_readLine(l);
  87
  88                                 break;
  89                         }
  90
  91                         // If this turns true, we found the end. Yay!
  92                         end = (buf[i + 1] == '/');
  93                 }
  94
  95         } while (got_more && !end);
  96
  97         // If we didn't find the end, barf.
  98         if (!end) {
  99                 l->i = i;
 100                 bc_lex_err(l, BC_ERR_PARSE_COMMENT);
 101         }
 102
 103         l->i = i + 2;
 104         l->line += nlines;
 105 }
 106
 107 void bc_lex_whitespace(BcLex *l) {
 108
 109         char c;
 110
 111         l->t = BC_LEX_WHITESPACE;
 112
 113         // Eat. We don't eat newlines because they can be special.
 114         for (c = l->buf[l->i]; c != '\n' && isspace(c); c = l->buf[++l->i]);
 115 }
 116
 117 void bc_lex_commonTokens(BcLex *l, char c) {
 118         if (!c) l->t = BC_LEX_EOF;
 119         else if (c == '\n') l->t = BC_LEX_NLINE;
 120         else bc_lex_whitespace(l);
 121 }
 122
 123 /**
 124  * Parses a number.
 125  * @param l         The lexer.
 126  * @param start     The start character.
 127  * @param int_only  Whether this function should only look for an integer. This
 128  *                  is used to implement the exponent of scientific notation.
 129  */
 130 static size_t bc_lex_num(BcLex *l, char start, bool int_only) {
 131
 132         const char *buf = l->buf + l->i;
 133         size_t i;
 134         char c;
 135         bool last_pt, pt = (start == '.');
 136
 137         // This loop looks complex. It is not. It is asking if the character is not
 138         // a nul byte and it if it a valid num character based on what we have found
 139         // thus far, or whether it is a backslash followed by a newline. I can do
 140         // i+1 on the buffer because the buffer must have a nul byte.
 141         for (i = 0; (c = buf[i]) && (BC_LEX_NUM_CHAR(c, pt, int_only) ||
 142                                      (c == '\\' && buf[i + 1] == '\n')); ++i)
 143         {
 144                 // I don't need to test that the next character is a newline because
 145                 // the loop condition above ensures that.
 146                 if (c == '\\') {
 147
 148                         i += 2;
 149
 150                         // Make sure to eat whitespace at the beginning of the line.
 151                         while(isspace(buf[i]) && buf[i] != '\n') i += 1;
 152
 153                         c = buf[i];
 154
 155                         // If the next character is not a number character, bail.
 156                         if (!BC_LEX_NUM_CHAR(c, pt, int_only)) break;
 157                 }
 158
 159                 // Did we find the radix point?
 160                 last_pt = (c == '.');
 161
 162                 // If we did, and we already have one, then break because it's not part
 163                 // of this number.
 164                 if (pt && last_pt) break;
 165
 166                 // Set whether we have found a radix point.
 167                 pt = pt || last_pt;
 168
 169                 bc_vec_push(&l->str, &c);
 170         }
 171
 172         return i;
 173 }
 174
 175 void bc_lex_number(BcLex *l, char start) {
 176
 177         l->t = BC_LEX_NUMBER;
 178
 179         // Make sure the string is clear.
 180         bc_vec_popAll(&l->str);
 181         bc_vec_push(&l->str, &start);
 182
 183         // Parse the number.
 184         l->i += bc_lex_num(l, start, false);
 185
 186 #if BC_ENABLE_EXTRA_MATH
 187         {
 188                 char c = l->buf[l->i];
 189
 190                 // Do we have a number in scientific notation?
 191                 if (c == 'e') {
 192
 193 #if BC_ENABLED
 194                         // Barf for POSIX.
 195                         if (BC_IS_POSIX) bc_lex_err(l, BC_ERR_POSIX_EXP_NUM);
 196 #endif // BC_ENABLED
 197
 198                         // Push the e.
 199                         bc_vec_push(&l->str, &c);
 200                         l->i += 1;
 201                         c = l->buf[l->i];
 202
 203                         // Check for negative specifically because bc_lex_num() does not.
 204                         if (c == BC_LEX_NEG_CHAR) {
 205                                 bc_vec_push(&l->str, &c);
 206                                 l->i += 1;
 207                                 c = l->buf[l->i];
 208                         }
 209
 210                         // We must have a number character, so barf if not.
 211                         if (BC_ERR(!BC_LEX_NUM_CHAR(c, false, true)))
 212                                 bc_lex_verr(l, BC_ERR_PARSE_CHAR, c);
 213
 214                         // Parse the exponent.
 215                         l->i += bc_lex_num(l, 0, true);
 216                 }
 217         }
 218 #endif // BC_ENABLE_EXTRA_MATH
 219
 220         bc_vec_pushByte(&l->str, '\0');
 221 }
 222
 223 void bc_lex_name(BcLex *l) {
 224
 225         size_t i = 0;
 226         const char *buf = l->buf + l->i - 1;
 227         char c = buf[i];
 228
 229         l->t = BC_LEX_NAME;
 230
 231         // Should be obvious. It's looking for valid characters.
 232         while ((c >= 'a' && c <= 'z') || isdigit(c) || c == '_') c = buf[++i];
 233
 234         // Set the string to the identifier.
 235         bc_vec_string(&l->str, i, buf);
 236
 237         // Increment the index. We minus 1 because it has already been incremented.
 238         l->i += i - 1;
 239 }
 240
 241 void bc_lex_init(BcLex *l) {
 242         BC_SIG_ASSERT_LOCKED;
 243         assert(l != NULL);
 244         bc_vec_init(&l->str, sizeof(char), BC_DTOR_NONE);
 245 }
 246
 247 void bc_lex_free(BcLex *l) {
 248         BC_SIG_ASSERT_LOCKED;
 249         assert(l != NULL);
 250         bc_vec_free(&l->str);
 251 }
 252
 253 void bc_lex_file(BcLex *l, const char *file) {
 254         assert(l != NULL && file != NULL);
 255         l->line = 1;
 256         vm.file = file;
 257 }
 258
 259 void bc_lex_next(BcLex *l) {
 260
 261         assert(l != NULL);
 262
 263         l->last = l->t;
 264
 265         // If this wasn't here, the line number would be off.
 266         l->line += (l->i != 0 && l->buf[l->i - 1] == '\n');
 267
 268         // If the last token was EOF, someone called this one too many times.
 269         if (BC_ERR(l->last == BC_LEX_EOF)) bc_lex_err(l, BC_ERR_PARSE_EOF);
 270
 271         l->t = BC_LEX_EOF;
 272
 273         // We are done if this is true.
 274         if (l->i == l->len) return;
 275
 276         // Loop until failure or we don't have whitespace. This
 277         // is so the parser doesn't get inundated with whitespace.
 278         do {
 279                 vm.next(l);
 280         } while (l->t == BC_LEX_WHITESPACE);
 281 }
 282
 283 /**
 284  * Updates the buffer and len so that they are not invalidated when the stdin
 285  * buffer grows.
 286  * @param l     The lexer.
 287  * @param text  The text.
 288  * @param len   The length of the text.
 289  */
 290 static void bc_lex_fixText(BcLex *l, const char *text, size_t len) {
 291         l->buf = text;
 292         l->len = len;
 293 }
 294
 295 bool bc_lex_readLine(BcLex *l) {
 296
 297         bool good = bc_vm_readLine(false);
 298
 299         bc_lex_fixText(l, vm.buffer.v, vm.buffer.len - 1);
 300
 301         return good;
 302 }
 303
 304 void bc_lex_text(BcLex *l, const char *text, bool is_stdin) {
 305         assert(l != NULL && text != NULL);
 306         bc_lex_fixText(l, text, strlen(text));
 307         l->i = 0;
 308         l->t = l->last = BC_LEX_INVALID;
 309         l->is_stdin = is_stdin;
 310         bc_lex_next(l);
 311 }