contrib/gcclibs/libcpp/lex.c

   1 /* CPP Library - lexical analysis.
   2    Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Contributed by Per Bothner, 1994-95.
   4    Based on CCCP program by Paul Rubin, June 1986
   5    Adapted to ANSI C, Richard Stallman, Jan 1987
   6    Broken out to separate file, Zack Weinberg, Mar 2000
   7
   8 This program is free software; you can redistribute it and/or modify it
   9 under the terms of the GNU General Public License as published by the
  10 Free Software Foundation; either version 2, or (at your option) any
  11 later version.
  12
  13 This program is distributed in the hope that it will be useful,
  14 but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  16 GNU General Public License for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with this program; if not, write to the Free Software
  20 Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "cpplib.h"
  25 #include "internal.h"
  26
  27 enum spell_type  /* Spelling category of a token type.  */
  28 {
  29   SPELL_OPERATOR = 0,  /* Category of every OP () row below.  */
  30   SPELL_IDENT,
  31   SPELL_LITERAL,
  32   SPELL_NONE
  33 };
  34 /* One row of token_spellings: a category plus a spelling string.  */
  35 struct token_spelling
  36 {
  37   enum spell_type category;
  38   const unsigned char *name;
  39 };
  40 /* Source spellings of the six digraphs.  */
  41 static const unsigned char *const digraph_spellings[] =
  42 { U"%:", U"%:%:", U"<:", U":>", U"<%", U"%>" };
  43 /* Expand TTYPE_TABLE: OP rows use string S, TK rows stringify E.  */
  44 #define OP(e, s) { SPELL_OPERATOR, U s  },
  45 #define TK(e, s) { SPELL_ ## s,    U #e },
  46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
  47 #undef OP
  48 #undef TK
  49 /* Look up the category / the spelling string for TOKEN's type.  */
  50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
  51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
  52 /* Forward declarations of file-local helpers.  */
  53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
  54 static int skip_line_comment (cpp_reader *);
  55 static void skip_whitespace (cpp_reader *, cppchar_t);
  56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
  57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
  58 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
  59                             unsigned int, enum cpp_ttype);
  60 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
  61 static int name_p (cpp_reader *, const cpp_string *);
  62 static tokenrun *next_tokenrun (tokenrun *);
  63
  64 static _cpp_buff *new_buff (size_t);
  65
  66
  67 /* Utility routine: does TOKEN spell the NUL-terminated STRING?
  68
  69    Yields 1 when they match and 0 when they differ.  A TOKEN whose
  70    type is not CPP_NAME never matches.  */
  71 int
  72 cpp_ideq (const cpp_token *token, const char *string)
  73 {
  74   const uchar *wanted = (const uchar *) string;
  75
  76   return (token->type == CPP_NAME
  77           && ustrcmp (NODE_NAME (token->val.node), wanted) == 0);
  78 }
  79
  80 /* Append a note of kind TYPE, located at byte POS of the current
  81    cleaned logical line, to BUFFER's note array (grown on demand).  */
  82 static void
  83 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
  84 {
  85   _cpp_line_note *slot;
  86
  87   if (buffer->notes_used == buffer->notes_cap)
  88     {
  89       buffer->notes_cap = 2 * buffer->notes_cap + 200;
  90       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
  91                                   buffer->notes_cap);
  92     }
  93   slot = &buffer->notes[buffer->notes_used++];
  94   slot->pos = pos, slot->type = type;
  95 }
  96
  97 /* Cleans the logical line at buffer->next_line in place: escaped newlines
  98    are spliced, enabled trigraphs converted.  Time-critical inner loop.  */
  99 void
 100 _cpp_clean_line (cpp_reader *pfile)
 101 {
 102   cpp_buffer *buffer;
 103   const uchar *s;
 104   uchar c, *d, *p;
 105
 106   buffer = pfile->buffer;
 107   buffer->cur_note = buffer->notes_used = 0;  /* Drop old notes.  */
 108   buffer->cur = buffer->line_base = buffer->next_line;
 109   buffer->need_line = false;
 110   s = buffer->next_line - 1;
 111   /* Buffers marked from_stage3 skip splicing and trigraph handling.  */
 112   if (!buffer->from_stage3)
 113     {
 114       const uchar *pbackslash = NULL;
 115
 116       /* Short circuit for the common case of an un-escaped line with
 117          no trigraphs.  The primary win here is by not writing any
 118          data back to memory until we have to.  */
 119       for (;;)
 120         {
 121           c = *++s;
 122           if (__builtin_expect (c == '\n', false)
 123               || __builtin_expect (c == '\r', false))
 124             {
 125               d = (uchar *) s;
 126
 127               if (__builtin_expect (s == buffer->rlimit, false))
 128                 goto done;
 129
 130               /* DOS line ending? */
 131               if (__builtin_expect (c == '\r', false)
 132                   && s[1] == '\n')
 133                 {
 134                   s++;
 135                   if (s == buffer->rlimit)
 136                     goto done;
 137                 }
 138
 139               if (__builtin_expect (pbackslash == NULL, true))
 140                 goto done;
 141
 142               /* Check for escaped newline.  */
 143               p = d;
 144               while (is_nvspace (p[-1]))
 145                 p--;
 146               if (p - 1 != pbackslash)
 147                 goto done;
 148
 149               /* Have an escaped newline; process it and proceed to
 150                  the slow path.  */
 151               add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
 152               d = p - 2;  /* Next write lands on the backslash.  */
 153               buffer->next_line = p - 1;
 154               break;
 155             }
 156           if (__builtin_expect (c == '\\', false))
 157             pbackslash = s;
 158           else if (__builtin_expect (c == '?', false)
 159                    && __builtin_expect (s[1] == '?', false)
 160                    && _cpp_trigraph_map[s[2]])
 161             {
 162               /* Have a trigraph.  We may or may not have to convert
 163                  it.  Add a line note regardless, for -Wtrigraphs.  */
 164               add_line_note (buffer, s, s[2]);
 165               if (CPP_OPTION (pfile, trigraphs))
 166                 {
 167                   /* We do, and that means we have to switch to the
 168                      slow path.  */
 169                   d = (uchar *) s;
 170                   *d = _cpp_trigraph_map[s[2]];
 171                   s += 2;
 172                   break;
 173                 }
 174             }
 175         }
 176
 177       /* Slow path: copy each byte from S down to D, splicing as we go.  */
 178       for (;;)
 179         {
 180           c = *++s;
 181           *++d = c;
 182
 183           if (c == '\n' || c == '\r')
 184             {
 185               /* Handle DOS line endings.  */
 186               if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
 187                 s++;
 188               if (s == buffer->rlimit)
 189                 break;
 190
 191               /* Escaped?  */
 192               p = d;
 193               while (p != buffer->next_line && is_nvspace (p[-1]))
 194                 p--;
 195               if (p == buffer->next_line || p[-1] != '\\')
 196                 break;
 197
 198               add_line_note (buffer, p - 1, p != d ? ' ': '\\');
 199               d = p - 2;  /* Next write lands on the backslash.  */
 200               buffer->next_line = p - 1;
 201             }
 202           else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
 203             {
 204               /* Add a note regardless, for the benefit of -Wtrigraphs.  */
 205               add_line_note (buffer, d, s[2]);
 206               if (CPP_OPTION (pfile, trigraphs))
 207                 {
 208                   *d = _cpp_trigraph_map[s[2]];
 209                   s += 2;
 210                 }
 211             }
 212         }
 213     }
 214   else
 215     {
 216       do
 217         s++;
 218       while (*s != '\n' && *s != '\r');
 219       d = (uchar *) s;
 220
 221       /* Handle DOS line endings.  */
 222       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
 223         s++;
 224     }
 225
 226  done:
 227   *d = '\n';  /* Terminate the cleaned line.  */
 228   /* A sentinel note that should never be processed.  */
 229   add_line_note (buffer, d + 1, '\n');
 230   buffer->next_line = s + 1;  /* Start of the following line.  */
 231 }
 232
 233 /* Decide whether the trigraph that NOTE records deserves a warning
 234    even though it sits inside a comment.  */
 235 static bool
 236 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
 237 {
 238   const _cpp_line_note *following = note + 1;
 239   const uchar *scan;
 240
 241   /* Only ??/ matters inside a comment: it may form an escaped
 242      newline and so change behavior.  Other trigraphs stay silent.  */
 243   if (note->type != '/')
 244     return false;
 245
 246   /* With -trigraphs the conversion has already happened, and an
 247      escaped newline shows up as a second note at the same spot.  */
 248   if (CPP_OPTION (pfile, trigraphs))
 249     return following->pos == note->pos;
 250
 251   /* Otherwise step over the trigraph and any horizontal space.  */
 252   for (scan = note->pos + 3; is_nvspace (*scan); scan++)
 253     continue;
 254
 255   /* Escaped newlines between the trigraph and the newline we stopped
 256      at would make it the wrong newline; the position test catches
 257      that.  */
 258   return *scan == '\n' && scan < following->pos;
 259 }
 260
 261 /* Process the notes created by add_line_note up to buffer->cur, issuing
 262    diagnostics; IN_COMMENT is nonzero while inside a comment.  */
 263 void
 264 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
 265 {
 266   cpp_buffer *buffer = pfile->buffer;
 267
 268   for (;;)
 269     {
 270       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
 271       unsigned int col;
 272       /* Stop at the first note that lies beyond the current position.  */
 273       if (note->pos > buffer->cur)
 274         break;
 275
 276       buffer->cur_note++;
 277       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
 278
 279       if (note->type == '\\' || note->type == ' ')
 280         {
 281           if (note->type == ' ' && !in_comment)
 282             cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 283                                  "backslash and newline separated by space");
 284
 285           if (buffer->next_line > buffer->rlimit)
 286             {
 287               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
 288                                    "backslash-newline at end of file");
 289               /* Prevent "no newline at end of file" warning.  */
 290               buffer->next_line = buffer->rlimit;
 291             }
 292           /* A physical line ended here: move line_base and count it.  */
 293           buffer->line_base = note->pos;
 294           CPP_INCREMENT_LINE (pfile, 0);
 295         }
 296       else if (_cpp_trigraph_map[note->type])
 297         {
 298           if (CPP_OPTION (pfile, warn_trigraphs)
 299               && (!in_comment || warn_in_comment (pfile, note)))
 300             {
 301               if (CPP_OPTION (pfile, trigraphs))
 302                 cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 303                                      "trigraph ??%c converted to %c",
 304                                      note->type,
 305                                      (int) _cpp_trigraph_map[note->type]);
 306               else
 307                 {
 308                   cpp_error_with_line
 309                     (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
 310                      "trigraph ??%c ignored, use -trigraphs to enable",
 311                      note->type);
 312                 }
 313             }
 314         }
 315       else
 316         abort ();  /* Not a note kind handled above.  */
 317     }
 318 }
 319
 320 /* Skip a C-style block comment.  We find the end of the comment by
 321    seeing if an asterisk is before every '/' we encounter.  Returns
 322    true if the comment was terminated by EOF, false otherwise (then
 323    buffer->cur is left just past the closing '/').
 324    On entry buffer->cur points to the initial asterisk of the comment.  */
 325 bool
 326 _cpp_skip_block_comment (cpp_reader *pfile)
 327 {
 328   cpp_buffer *buffer = pfile->buffer;
 329   const uchar *cur = buffer->cur;
 330   uchar c;
 331   /* Skip the '*' and a '/' directly after it; that '/' cannot close.  */
 332   cur++;
 333   if (*cur == '/')
 334     cur++;
 335
 336   for (;;)
 337     {
 338       /* People like decorating comments with '*', so check for '/'
 339          instead for efficiency.  */
 340       c = *cur++;
 341
 342       if (c == '/')
 343         {
 344           if (cur[-2] == '*')
 345             break;
 346
 347           /* Warn about potential nested comments, but not if the '/'
 348              comes immediately before the true comment delimiter.
 349              Don't bother to get it right across escaped newlines.  */
 350           if (CPP_OPTION (pfile, warn_comments)
 351               && cur[0] == '*' && cur[1] != '/')
 352             {
 353               buffer->cur = cur;
 354               cpp_error_with_line (pfile, CPP_DL_WARNING,
 355                                    pfile->line_table->highest_line, CPP_BUF_COL (buffer),
 356                                    "\"/*\" within comment");
 357             }
 358         }
 359       else if (c == '\n')
 360         {
 361           unsigned int cols;
 362           buffer->cur = cur - 1;
 363           _cpp_process_line_notes (pfile, true);
 364           if (buffer->next_line >= buffer->rlimit)
 365             return true;
 366           _cpp_clean_line (pfile);
 367           /* COLS is the byte length of the line just cleaned.  */
 368           cols = buffer->next_line - buffer->line_base;
 369           CPP_INCREMENT_LINE (pfile, cols);
 370
 371           cur = buffer->cur;
 372         }
 373     }
 374
 375   buffer->cur = cur;
 376   _cpp_process_line_notes (pfile, true);
 377   return false;
 378 }
 379
 380 /* Skip a C++ line comment.  On return buffer->cur rests on the newline
 381    that ends it; escaped newlines inside the comment are handled.
 382    The result is nonzero when the comment covered several lines.  */
 383 static int
 384 skip_line_comment (cpp_reader *pfile)
 385 {
 386   const unsigned int start_line = pfile->line_table->highest_line;
 387   cpp_buffer *const buf = pfile->buffer;
 388
 389   for (; *buf->cur != '\n'; buf->cur++)
 390     continue;
 391
 392   _cpp_process_line_notes (pfile, true);
 393   return pfile->line_table->highest_line != start_line;
 394 }
 395
 396 /* Skip a run of whitespace that starts with C, leaving buffer->cur on
 397    the first character that is not part of the run.  */
 398 static void
 399 skip_whitespace (cpp_reader *pfile, cppchar_t c)
 400 {
 401   cpp_buffer *const buf = pfile->buffer;
 402   bool nul_seen = false;
 403
 404   /* The run is made of non-vertical space only: ' ' \t \f \v \0.  */
 405   do
 406     {
 407       const bool horizontal = (c == ' ' || c == '\t');
 408
 409       /* Horizontal space needs no attention; what remains is \0, or
 410          \f and \v, which pedantic mode diagnoses inside directives.  */
 411       if (c == '\0')
 412         nul_seen = true;
 413       else if (!horizontal
 414                && pfile->state.in_directive && CPP_PEDANTIC (pfile))
 415         cpp_error_with_line (pfile, CPP_DL_PEDWARN,
 416                              pfile->line_table->highest_line, CPP_BUF_COL (buf),
 417                              "%s in preprocessing directive",
 418                              c == '\f' ? "form feed" : "vertical tab");
 419       c = *buf->cur++;
 420     }
 421   while (is_nvspace (c));
 422
 423   if (nul_seen)
 424     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
 425   buf->cur--;
 426 }
 427
 428 /* Report (1 or 0) whether every character of the number token STRING
 429    could appear in a name, i.e. there is no '.', '+' or '-' in it.  */
 430 static int
 431 name_p (cpp_reader *pfile, const cpp_string *string)
 432 {
 433   const unsigned char *ch = string->text;
 434   const unsigned char *const end = ch + string->len;
 435
 436   while (ch != end && is_idchar (*ch))
 437     ch++;
 438
 439   return ch == end;
 440 }
 441
 442 /* After parsing an identifier or other sequence, warn if it is not in
 443    NFC/NFKC.  The scratch spelling buffer is released before returning.  */
 444 static void
 445 warn_about_normalization (cpp_reader *pfile,
 446                           const cpp_token *token,
 447                           const struct normalize_state *s)
 448 {
 449   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
 450       && !pfile->state.skipping)
 451     {
 452       /* Make sure that the token is printed using UCNs, even
 453          if we'd otherwise happily print UTF-8.  */
 454       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
 455       size_t sz = cpp_spell_token (pfile, token, buf, false) - buf;
 456
 457       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 458         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 459                              "`%.*s' is not in NFKC", (int) sz, buf);
 460       else
 461         cpp_error_with_line (pfile, CPP_DL_WARNING, token->src_loc, 0,
 462                              "`%.*s' is not in NFC", (int) sz, buf);
 463       free (buf);  /* Heap scratch from XNEWVEC; do not leak it.  */
 464     }
 465 }
 466
 467 /* Returns TRUE if buffer->cur sits on a '$' or UCN that is acceptable
 468    in an identifier, after stepping over it; FALSE otherwise.  FIRST
 469    is TRUE if this starts an identifier.  */
 470 static bool
 471 forms_identifier_p (cpp_reader *pfile, int first,
 472                     struct normalize_state *state)
 473 {
 474   cpp_buffer *const buf = pfile->buffer;
 475   const uchar lead = *buf->cur;
 476
 477   if (lead == '$')
 478     {
 479       if (!CPP_OPTION (pfile, dollars_in_ident))
 480         return false;
 481       buf->cur++;
 482
 483       /* Complain about '$' once only: the option is cleared here.  */
 484       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
 485         {
 486           CPP_OPTION (pfile, warn_dollars) = 0;
 487           cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
 488         }
 489       return true;
 490     }
 491
 492   /* Anything else must be a syntactically valid UCN.  */
 493   if (lead != '\\' || !CPP_OPTION (pfile, extended_identifiers))
 494     return false;
 495   if (buf->cur[1] != 'u' && buf->cur[1] != 'U')
 496     return false;
 497
 498   buf->cur += 2;
 499   if (_cpp_valid_ucn (pfile, &buf->cur, buf->rlimit, 1 + !first, state))
 500     return true;
 501   buf->cur -= 2;
 502   return false;
 503 }
 504
 505 /* Lex an identifier starting at BUFFER->CUR - 1 and return its hash node.  */
 506 static cpp_hashnode *
 507 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
 508                 struct normalize_state *nst)
 509 {
 510   cpp_hashnode *result;
 511   const uchar *cur;
 512   unsigned int len;
 513   unsigned int hash = HT_HASHSTEP (0, *base);
 514   /* Fast path: scan plain identifier characters, hashing as we go.  */
 515   cur = pfile->buffer->cur;
 516   if (! starts_ucn)
 517     while (ISIDNUM (*cur))
 518       {
 519         hash = HT_HASHSTEP (hash, *cur);
 520         cur++;
 521       }
 522   pfile->buffer->cur = cur;
 523   if (starts_ucn || forms_identifier_p (pfile, false, nst))
 524     {
 525       /* Slower version for identifiers containing UCNs (or $).  */
 526       do {
 527         while (ISIDNUM (*pfile->buffer->cur))
 528           {
 529             pfile->buffer->cur++;
 530             NORMALIZE_STATE_UPDATE_IDNUM (nst);
 531           }
 532       } while (forms_identifier_p (pfile, false, nst));
 533       result = _cpp_interpret_identifier (pfile, base,
 534                                           pfile->buffer->cur - base);
 535     }
 536   else
 537     {
 538       len = cur - base;
 539       hash = HT_HASHFINISH (hash, len);
 540       /* LEN and HASH describe the plain spelling scanned above.  */
 541       result = (cpp_hashnode *)
 542         ht_lookup_with_hash (pfile->hash_table, base, len, hash, HT_ALLOC);
 543     }
 544
 545   /* Rarely, identifiers require diagnostics when lexed.  */
 546   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
 547                         && !pfile->state.skipping, 0))
 548     {
 549       /* It is allowed to poison the same identifier twice.  */
 550       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
 551         cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
 552                    NODE_NAME (result));
 553
 554       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
 555          replacement list of a variadic macro.  */
 556       if (result == pfile->spec_nodes.n__VA_ARGS__
 557           && !pfile->state.va_args_ok)
 558         cpp_error (pfile, CPP_DL_PEDWARN,
 559                    "__VA_ARGS__ can only appear in the expansion"
 560                    " of a C99 variadic macro");
 561     }
 562
 563   return result;
 564 }
 565
 566 /* Lex the number that starts at BUFFER->CUR - 1, storing a
 567    NUL-terminated copy of its spelling in NUMBER.  */
 568 static void
 569 lex_number (cpp_reader *pfile, cpp_string *number,
 570             struct normalize_state *nst)
 571 {
 572   const uchar *const start = pfile->buffer->cur - 1;
 573   const uchar *end;
 574   uchar *copy;
 575
 576   do
 577     {
 578       /* N.B. ISIDNUM does not include $; forms_identifier_p below
 579          deals with that and with UCNs.  */
 580       for (end = pfile->buffer->cur;
 581            ISIDNUM (*end) || *end == '.' || VALID_SIGN (*end, end[-1]);
 582            end++)
 583         NORMALIZE_STATE_UPDATE_IDNUM (nst);
 584
 585       pfile->buffer->cur = end;
 586     }
 587   while (forms_identifier_p (pfile, false, nst));
 588
 589   /* Hand NUMBER a NUL-terminated copy of the spelling, allocated
 590      with _cpp_unaligned_alloc.  */
 591   number->len = end - start;
 592   copy = _cpp_unaligned_alloc (pfile, number->len + 1);
 593   memcpy (copy, start, number->len);
 594   copy[number->len] = '\0';
 595   number->text = copy;
 596 }
 597
 598 /* Make TOKEN a TYPE token spelled by a fresh copy of BASE[0..LEN-1].  */
 599 static void
 600 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
 601                 unsigned int len, enum cpp_ttype type)
 602 {
 603   uchar *const copy = _cpp_unaligned_alloc (pfile, len + 1);
 604
 605   token->val.str.text = (uchar *) memcpy (copy, base, len);
 606   token->val.str.len = len;
 607   token->type = type;
 608
 609   copy[len] = '\0';  /* The terminator is not counted in LEN.  */
 610 }
 611
 612 /* Lexes a string, character constant, or angle-bracketed header file
 613    name starting at BASE.  The literal stored in TOKEN keeps its full
 614    spelling, including the opening quote and any leading 'L'.  TOKEN's
 615    type is that of the literal, or CPP_OTHER if it was not properly
 616    terminated.
 617    The spelling is NUL-terminated, but that need not be its first NUL,
 618    because embedded NULs are preserved.  */
 619 static void
 620 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
 621 {
 622   const bool wide = (*base == 'L');
 623   const uchar *p = base + (wide ? 2 : 1);
 624   cppchar_t terminator = p[-1];
 625   enum cpp_ttype type;
 626   bool nul_seen = false;
 627   bool closed = false;
 628
 629   if (terminator == '\"')
 630     type = wide ? CPP_WSTRING : CPP_STRING;
 631   else if (terminator == '\'')
 632     type = wide ? CPP_WCHAR : CPP_CHAR;
 633   else
 634     {
 635       terminator = '>';
 636       type = CPP_HEADER_NAME;
 637     }
 638
 639   while (!closed)
 640     {
 641       const cppchar_t c = *p++;
 642
 643       /* In #include-style directives, terminators are not escapable.  */
 644       if (c == '\\' && !pfile->state.angled_headers && *p != '\n')
 645         p++;
 646       else if (c == terminator)
 647         closed = true;
 648       else if (c == '\n')
 649         {
 650           p--;
 651           type = CPP_OTHER;
 652           closed = true;
 653         }
 654       else if (c == '\0')
 655         nul_seen = true;
 656     }
 657
 658   if (nul_seen && !pfile->state.skipping)
 659     cpp_error (pfile, CPP_DL_WARNING,
 660                "null character(s) preserved in literal");
 661
 662   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
 663     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
 664                (int) terminator);
 665   pfile->buffer->cur = p;
 666   create_literal (pfile, token, base, p - base, type);
 667 }
 668
 669 /* Store the comment that began at FROM in TOKEN as a CPP_COMMENT.  The
 670    saved text includes the comment start and any terminator.  TYPE is
 671    '/' for a C++ comment.  */
 672 static void
 673 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
 674               cppchar_t type)
 675 {
 676   /* Inside a directive every C++ comment is stored as a C comment.
 677      The only directive that gets here is a "#define" whose comments
 678      are being saved.  */
 679   const bool as_c_comment = pfile->state.in_directive && type == '/';
 680   unsigned int body_len, total_len;
 681   unsigned char *text;
 682
 683   /* The extra 1 accounts for the initial '/'.  */
 684   body_len = pfile->buffer->cur - from + 1;
 685
 686   /* C++ comments probably (not definitely) have moved past a new
 687      line, which must not end up in the saved comment.  */
 688   if (is_vspace (pfile->buffer->cur[-1]))
 689     body_len--;
 690
 691   /* The converted form needs two more bytes for its terminator.  */
 692   total_len = as_c_comment ? body_len + 2 : body_len;
 693   text = _cpp_unaligned_alloc (pfile, total_len);
 694
 695   text[0] = '/';
 696   memcpy (text + 1, from, body_len - 1);
 697   /* Finish the conversion to a C comment, if necessary.  */
 698   if (as_c_comment)
 699     {
 700       text[1] = '*';
 701       text[total_len - 2] = '*';
 702       text[total_len - 1] = '/';
 703     }
 704
 705   token->type = CPP_COMMENT;
 706   token->val.str.len = total_len;
 707   token->val.str.text = text;
 708 }
 709
 710 /* Give RUN fresh storage for COUNT tokens; it gets no successor.  */
 711 void
 712 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
 713 {
 714   run->next = NULL;
 715   run->base = XNEWVEC (cpp_token, count);
 716   run->limit = &run->base[count];
 717 }
 718
 719 /* Returns RUN's successor, first creating a 250-token one if missing.  */
 720 static tokenrun *
 721 next_tokenrun (tokenrun *run)
 722 {
 723   tokenrun *succ = run->next;
 724   if (succ == NULL)
 725     {
 726       run->next = succ = XNEW (tokenrun);
 727       succ->prev = run;
 728       _cpp_init_tokenrun (succ, 250);
 729     }
 730   return succ;
 731 }
 732
 733 /* Hand out one token from the current line's token storage; it is
 734    invalidated together with the other tokens of the line.  The new
 735    token copies the source location of the most recently lexed token,
 736    so that diagnostics appear in the right place.  */
 737 cpp_token *
 738 _cpp_temp_token (cpp_reader *pfile)
 739 {
 740   const cpp_token *const previous = pfile->cur_token - 1;
 741   cpp_token *fresh;
 742
 743   if (pfile->cur_token == pfile->cur_run->limit)
 744     {
 745       tokenrun *const run = next_tokenrun (pfile->cur_run);
 746       pfile->cur_run = run;
 747       pfile->cur_token = run->base;
 748     }
 749   fresh = pfile->cur_token++;
 750   fresh->src_loc = previous->src_loc;
 751   return fresh;
 752 }
 753
 754 /* Lex and return the next token (external interface).  Takes care of
 755    issues like directive handling, token lookahead, multiple include
 756    optimization and skipping.  */
 757 const cpp_token *
 758 _cpp_lex_token (cpp_reader *pfile)
 759 {
 760   cpp_token *result;
 761
 762   for (;;)
 763     {
 764       if (pfile->cur_token == pfile->cur_run->limit)
 765         {
 766           pfile->cur_run = next_tokenrun (pfile->cur_run);
 767           pfile->cur_token = pfile->cur_run->base;
 768         }
 769       /* We assume that the current token is somewhere in the current
 770          run.  */
 771       if (pfile->cur_token < pfile->cur_run->base
 772           || pfile->cur_token >= pfile->cur_run->limit)
 773         abort ();
 774       /* Pending lookahead tokens are re-read from the run, not lexed anew.  */
 775       if (pfile->lookaheads)
 776         {
 777           pfile->lookaheads--;
 778           result = pfile->cur_token++;
 779         }
 780       else
 781         result = _cpp_lex_direct (pfile);
 782
 783       if (result->flags & BOL)
 784         {
 785           /* Is this a directive.  If _cpp_handle_directive returns
 786              false, it is an assembler #.  */
 787           if (result->type == CPP_HASH
 788               /* 6.10.3 p 11: Directives in a list of macro arguments
 789                  gives undefined behavior.  This implementation
 790                  handles the directive as normal.  */
 791               && pfile->state.parsing_args != 1)
 792             {
 793               if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
 794                 {
 795                   if (pfile->directive_result.type == CPP_PADDING)
 796                     continue;
 797                   result = &pfile->directive_result;
 798                 }
 799             }
 800           else if (pfile->state.in_deferred_pragma)
 801             result = &pfile->directive_result;
 802           /* Tell the line_change callback, unless we are skipping.  */
 803           if (pfile->cb.line_change && !pfile->state.skipping)
 804             pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
 805         }
 806
 807       /* We don't skip tokens in directives.  */
 808       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
 809         break;
 810
 811       /* Outside a directive, invalidate controlling macros.  At file
 812          EOF, _cpp_lex_direct takes care of popping the buffer, so we never
 813          get here and MI optimization works.  */
 814       pfile->mi_valid = false;
 815       /* While skipping, only CPP_EOF is passed back to the caller.  */
 816       if (!pfile->state.skipping || result->type == CPP_EOF)
 817         break;
 818     }
 819
 820   return result;
 821 }
 822
 823 /* Returns true if a line is ready to lex, loading a fresh one if needed.  */
 824 bool
 825 _cpp_get_fresh_line (cpp_reader *pfile)
 826 {
 827   int return_at_eof;
 828
 829   /* We can't get a new line until we leave the current directive.  */
 830   if (pfile->state.in_directive)
 831     return false;
 832
 833   for (;;)
 834     {
 835       cpp_buffer *buffer = pfile->buffer;
 836
 837       if (!buffer->need_line)
 838         return true;
 839       /* More text in this buffer: clean the next line and hand it out.  */
 840       if (buffer->next_line < buffer->rlimit)
 841         {
 842           _cpp_clean_line (pfile);
 843           return true;
 844         }
 845
 846       /* First, get out of parsing arguments state.  */
 847       if (pfile->state.parsing_args)
 848         return false;
 849
 850       /* End of buffer.  Non-empty files should end in a newline.  */
 851       if (buffer->buf != buffer->rlimit
 852           && buffer->next_line > buffer->rlimit
 853           && !buffer->from_stage3)
 854         {
 855           /* Clip to buffer size.  */
 856           buffer->next_line = buffer->rlimit;
 857           /* APPLE LOCAL begin suppress no newline warning.  */
 858           if ( CPP_OPTION (pfile, warn_newline_at_eof))
 859             {
 860               cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
 861                                    CPP_BUF_COLUMN (buffer, buffer->cur),
 862                                    "no newline at end of file");
 863             }
 864           /* APPLE LOCAL end suppress no newline warning.  */
 865         }
 866       /* Pop the finished buffer; stop if it was the last or return_at_eof.  */
 867       return_at_eof = buffer->return_at_eof;
 868       _cpp_pop_buffer (pfile);
 869       if (pfile->buffer == NULL || return_at_eof)
 870         return false;
 871     }
 872 }
 873
 874 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)          \
 875   do  /* Expects `result' and `buffer' in scope.  */    \
 876     {                                                   \
 877       result->type = ELSE_TYPE;  /* The default.  */    \
 878       if (*buffer->cur == CHAR)  /* Is CHAR next?  */   \
 879         buffer->cur++, result->type = THEN_TYPE;        \
 880     }                                                   \
 881   while (0)
 882
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token.

   The body is a small state machine built from three labels:
     fresh_line          - (re)start: fetch a new line if one is needed.
     update_tokens_line  - refresh the token's source location.
     skipped_white       - read the next character and dispatch on it.
   Whitespace and discarded comments loop back to one of these labels
   instead of returning, so exactly one token is produced per call.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot; the cursor is advanced immediately.  */
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The line that just ended belonged to a deferred pragma: report
         its end as a token of its own and undo the expansion block if
         one was in place.  */
      if (pfile->state.in_deferred_pragma)
        {
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }
      /* No further line available: hand back CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }
      /* Unless tokens must be kept, rewind to the first slot of the
         base token run and lex into that instead.  */
      if (!pfile->keep_tokens)
        {
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }
      result->flags = BOL;
      /* NOTE(review): parsing_args == 2 presumably means "inside a macro
         argument list", where the line break counts as whitespace --
         confirm against the definition of the state field.  */
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  /* Re-read the buffer pointer; presumably pfile->buffer can change
     while a fresh line is fetched above.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  /* Handle any pending line notes once the cursor has reached the
     position of the next one (skipped while a buffer is overlaid).  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  LINEMAP_POSITION_FOR_COLUMN (result->src_loc, pfile->line_table,
                               CPP_BUF_COLUMN (buffer, buffer->cur));

  /* From here on buffer->cur points just past C, so buffer->cur - 1 is
     the first character of the token being lexed.  */
  switch (c)
    {
      /* Horizontal whitespace (NUL is handled here too): note it on the
         token and keep scanning.  */
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* The line counter is only bumped when the newline is not the
         last character of the buffer.  */
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &nst);
        warn_about_normalization (pfile, result, &nst);
        break;
      }

    case 'L':
      /* 'L' may introduce wide characters or strings.  */
      if (*buffer->cur == '\'' || *buffer->cur == '"')
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }
      /* Fall through.  */

      /* Identifier start characters.  'L' is absent from the list below
         because it has its own case above, which falls into this one.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;
        result->val.node = lex_identifier (pfile, buffer->cur - 1, false,
                                           &nst);
        warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  The identifier
         node stays in val.node and NAMED_OP records the substitution.  */
      if (result->val.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type = (enum cpp_ttype) result->val.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                            || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && ! buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_error (pfile, CPP_DL_WARNING, "multi-line comment");
        }
      else if (c == '=')
        {
          buffer->cur++;
          result->type = CPP_DIV_EQ;
          break;
        }
      else
        {
          result->type = CPP_DIV;
          break;
        }

      /* Only the two comment branches reach this point.  A comment that
         is not being saved counts as whitespace: flag the token and
         resume scanning with a refreshed source location.  */
      if (!pfile->state.save_comments)
        {
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled headers are expected, '<' opens a header name and is
         lexed by lex_string.  */
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "<:" and "<%".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
        {
          /* Digraphs "%:" (and its doubled form "%:%:") and "%>".  */
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                buffer->cur += 2, result->type = CPP_PASTE;
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      /* A '.' followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state nst = INITIAL_NORMALIZE_STATE;
          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &nst);
          warn_about_normalization (pfile, result, &nst);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            buffer->cur++, result->type = CPP_DEREF_STAR;
        }
      else if (*buffer->cur == '-')
        buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
        buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

      /* Operators whose only two-character form is decided by a single
         following character.  */
    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); break;

      /* Single-character punctuators.  */
    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character so forms_identifier_p can decide
           whether it begins an identifier.  */
        const uchar *base = --buffer->cur;
        struct normalize_state nst = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &nst))
          {
            result->type = CPP_NAME;
            result->val.node = lex_identifier (pfile, base, true, &nst);
            warn_about_normalization (pfile, result, &nst);
            break;
          }
        /* Not an identifier: step forward again and treat the character
           as a stray one below.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
1241
1242 /* An upper bound on the number of bytes needed to spell TOKEN.
1243    Does not include preceding whitespace.  */
1244 unsigned int
1245 cpp_token_len (const cpp_token *token)
1246 {
1247   unsigned int len;
1248
1249   switch (TOKEN_SPELL (token))
1250     {
1251     default:            len = 4;                                break;
1252     case SPELL_LITERAL: len = token->val.str.len;               break;
1253     case SPELL_IDENT:   len = NODE_LEN (token->val.node) * 10;  break;
1254     }
1255
1256   return len;
1257 }
1258
/* Decode the UTF-8 sequence that starts at NAME and store the
   equivalent universal character name, spelled as a backslash, 'U' and
   eight lowercase hex digits, in BUFFER.  Exactly 10 bytes are written
   and no terminator is added.  Returns the number of bytes of NAME
   that made up the sequence.  Aborts if a trailing byte is not of the
   form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code_point;
  unsigned lead_bits;
  int seq_len = 0;
  int k;

  /* The number of leading one bits in the first byte is the length
     of the whole sequence.  */
  lead_bits = *name;
  while (lead_bits & 0x80)
    {
      seq_len++;
      lead_bits <<= 1;
    }

  /* The payload of the first byte is whatever follows the length
     marker; each trailing byte contributes six more bits.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      name++;
      code_point = (code_point << 6) | (*name & 0x3F);

      /* Ill-formed UTF-8.  */
      if ((*name & ~0x3F) != 0x80)
        abort ();
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
1292
1293
1294 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
1295    already contain the enough space to hold the token's spelling.
1296    Returns a pointer to the character after the last character written.
1297    FORSTRING is true if this is to be the spelling after translation
1298    phase 1 (this is different for UCNs).
1299    FIXME: Would be nice if we didn't need the PFILE argument.  */
1300 unsigned char *
1301 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
1302                  unsigned char *buffer, bool forstring)
1303 {
1304   switch (TOKEN_SPELL (token))
1305     {
1306     case SPELL_OPERATOR:
1307       {
1308         const unsigned char *spelling;
1309         unsigned char c;
1310
1311         if (token->flags & DIGRAPH)
1312           spelling
1313             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1314         else if (token->flags & NAMED_OP)
1315           goto spell_ident;
1316         else
1317           spelling = TOKEN_NAME (token);
1318
1319         while ((c = *spelling++) != '\0')
1320           *buffer++ = c;
1321       }
1322       break;
1323
1324     spell_ident:
1325     case SPELL_IDENT:
1326       if (forstring)
1327         {
1328           memcpy (buffer, NODE_NAME (token->val.node),
1329                   NODE_LEN (token->val.node));
1330           buffer += NODE_LEN (token->val.node);
1331         }
1332       else
1333         {
1334           size_t i;
1335           const unsigned char * name = NODE_NAME (token->val.node);
1336
1337           for (i = 0; i < NODE_LEN (token->val.node); i++)
1338             if (name[i] & ~0x7F)
1339               {
1340                 i += utf8_to_ucn (buffer, name + i) - 1;
1341                 buffer += 10;
1342               }
1343             else
1344               *buffer++ = NODE_NAME (token->val.node)[i];
1345         }
1346       break;
1347
1348     case SPELL_LITERAL:
1349       memcpy (buffer, token->val.str.text, token->val.str.len);
1350       buffer += token->val.str.len;
1351       break;
1352
1353     case SPELL_NONE:
1354       cpp_error (pfile, CPP_DL_ICE,
1355                  "unspellable token %s", TOKEN_NAME (token));
1356       break;
1357     }
1358
1359   return buffer;
1360 }
1361
1362 /* Returns TOKEN spelt as a null-terminated string.  The string is
1363    freed when the reader is destroyed.  Useful for diagnostics.  */
1364 unsigned char *
1365 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
1366 {
1367   unsigned int len = cpp_token_len (token) + 1;
1368   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
1369
1370   end = cpp_spell_token (pfile, token, start, false);
1371   end[0] = '\0';
1372
1373   return start;
1374 }
1375
1376 /* Used by C front ends, which really should move to using
1377    cpp_token_as_text.  */
1378 const char *
1379 cpp_type2name (enum cpp_ttype type)
1380 {
1381   return (const char *) token_spellings[type].name;
1382 }
1383
1384 /* Writes the spelling of token to FP, without any preceding space.
1385    Separated from cpp_spell_token for efficiency - to avoid stdio
1386    double-buffering.  */
1387 void
1388 cpp_output_token (const cpp_token *token, FILE *fp)
1389 {
1390   switch (TOKEN_SPELL (token))
1391     {
1392     case SPELL_OPERATOR:
1393       {
1394         const unsigned char *spelling;
1395         int c;
1396
1397         if (token->flags & DIGRAPH)
1398           spelling
1399             = digraph_spellings[(int) token->type - (int) CPP_FIRST_DIGRAPH];
1400         else if (token->flags & NAMED_OP)
1401           goto spell_ident;
1402         else
1403           spelling = TOKEN_NAME (token);
1404
1405         c = *spelling;
1406         do
1407           putc (c, fp);
1408         while ((c = *++spelling) != '\0');
1409       }
1410       break;
1411
1412     spell_ident:
1413     case SPELL_IDENT:
1414       {
1415         size_t i;
1416         const unsigned char * name = NODE_NAME (token->val.node);
1417
1418         for (i = 0; i < NODE_LEN (token->val.node); i++)
1419           if (name[i] & ~0x7F)
1420             {
1421               unsigned char buffer[10];
1422               i += utf8_to_ucn (buffer, name + i) - 1;
1423               fwrite (buffer, 1, 10, fp);
1424             }
1425           else
1426             fputc (NODE_NAME (token->val.node)[i], fp);
1427       }
1428       break;
1429
1430     case SPELL_LITERAL:
1431       fwrite (token->val.str.text, 1, token->val.str.len, fp);
1432       break;
1433
1434     case SPELL_NONE:
1435       /* An error, most probably.  */
1436       break;
1437     }
1438 }
1439
1440 /* Compare two tokens.  */
1441 int
1442 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
1443 {
1444   if (a->type == b->type && a->flags == b->flags)
1445     switch (TOKEN_SPELL (a))
1446       {
1447       default:                  /* Keep compiler happy.  */
1448       case SPELL_OPERATOR:
1449         return 1;
1450       case SPELL_NONE:
1451         return (a->type != CPP_MACRO_ARG || a->val.arg_no == b->val.arg_no);
1452       case SPELL_IDENT:
1453         return a->val.node == b->val.node;
1454       case SPELL_LITERAL:
1455         return (a->val.str.len == b->val.str.len
1456                 && !memcmp (a->val.str.text, b->val.str.text,
1457                             a->val.str.len));
1458       }
1459
1460   return 0;
1461 }
1462
1463 /* Returns nonzero if a space should be inserted to avoid an
1464    accidental token paste for output.  For simplicity, it is
1465    conservative, and occasionally advises a space where one is not
1466    needed, e.g. "." and ".2".  */
1467 int
1468 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
1469                  const cpp_token *token2)
1470 {
1471   enum cpp_ttype a = token1->type, b = token2->type;
1472   cppchar_t c;
1473
1474   if (token1->flags & NAMED_OP)
1475     a = CPP_NAME;
1476   if (token2->flags & NAMED_OP)
1477     b = CPP_NAME;
1478
1479   c = EOF;
1480   if (token2->flags & DIGRAPH)
1481     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
1482   else if (token_spellings[b].category == SPELL_OPERATOR)
1483     c = token_spellings[b].name[0];
1484
1485   /* Quickly get everything that can paste with an '='.  */
1486   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
1487     return 1;
1488
1489   switch (a)
1490     {
1491     case CPP_GREATER:   return c == '>';
1492     case CPP_LESS:      return c == '<' || c == '%' || c == ':';
1493     case CPP_PLUS:      return c == '+';
1494     case CPP_MINUS:     return c == '-' || c == '>';
1495     case CPP_DIV:       return c == '/' || c == '*'; /* Comments.  */
1496     case CPP_MOD:       return c == ':' || c == '>';
1497     case CPP_AND:       return c == '&';
1498     case CPP_OR:        return c == '|';
1499     case CPP_COLON:     return c == ':' || c == '>';
1500     case CPP_DEREF:     return c == '*';
1501     case CPP_DOT:       return c == '.' || c == '%' || b == CPP_NUMBER;
1502     case CPP_HASH:      return c == '#' || c == '%'; /* Digraph form.  */
1503     case CPP_NAME:      return ((b == CPP_NUMBER
1504                                  && name_p (pfile, &token2->val.str))
1505                                 || b == CPP_NAME
1506                                 || b == CPP_CHAR || b == CPP_STRING); /* L */
1507     case CPP_NUMBER:    return (b == CPP_NUMBER || b == CPP_NAME
1508                                 || c == '.' || c == '+' || c == '-');
1509                                       /* UCNs */
1510     case CPP_OTHER:     return ((token1->val.str.text[0] == '\\'
1511                                  && b == CPP_NAME)
1512                                 || (CPP_OPTION (pfile, objc)
1513                                     && token1->val.str.text[0] == '@'
1514                                     && (b == CPP_NAME || b == CPP_STRING)));
1515     default:            break;
1516     }
1517
1518   return 0;
1519 }
1520
1521 /* Output all the remaining tokens on the current line, and a newline
1522    character, to FP.  Leading whitespace is removed.  If there are
1523    macros, special token padding is not performed.  */
1524 void
1525 cpp_output_line (cpp_reader *pfile, FILE *fp)
1526 {
1527   const cpp_token *token;
1528
1529   token = cpp_get_token (pfile);
1530   while (token->type != CPP_EOF)
1531     {
1532       cpp_output_token (token, fp);
1533       token = cpp_get_token (pfile);
1534       if (token->flags & PREV_WHITE)
1535         putc (' ', fp);
1536     }
1537
1538   putc ('\n', fp);
1539 }
1540
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest payload a new buffer is given.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   will be handed out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.

   Every macro argument is parenthesized so that an arbitrary
   expression may be passed without changing how it is grouped.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
        ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
1555
1556 /* Create a new allocation buffer.  Place the control block at the end
1557    of the buffer, so that buffer overflows will cause immediate chaos.  */
1558 static _cpp_buff *
1559 new_buff (size_t len)
1560 {
1561   _cpp_buff *result;
1562   unsigned char *base;
1563
1564   if (len < MIN_BUFF_SIZE)
1565     len = MIN_BUFF_SIZE;
1566   len = CPP_ALIGN (len);
1567
1568   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
1569   result = (_cpp_buff *) (base + len);
1570   result->base = base;
1571   result->cur = base;
1572   result->limit = base + len;
1573   result->next = NULL;
1574   return result;
1575 }
1576
1577 /* Place a chain of unwanted allocation buffers on the free list.  */
1578 void
1579 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
1580 {
1581   _cpp_buff *end = buff;
1582
1583   while (end->next)
1584     end = end->next;
1585   end->next = pfile->free_buffs;
1586   pfile->free_buffs = buff;
1587 }
1588
1589 /* Return a free buffer of size at least MIN_SIZE.  */
1590 _cpp_buff *
1591 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
1592 {
1593   _cpp_buff *result, **p;
1594
1595   for (p = &pfile->free_buffs;; p = &(*p)->next)
1596     {
1597       size_t size;
1598
1599       if (*p == NULL)
1600         return new_buff (min_size);
1601       result = *p;
1602       size = result->limit - result->base;
1603       /* Return a buffer that's big enough, but don't waste one that's
1604          way too big.  */
1605       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
1606         break;
1607     }
1608
1609   *p = result->next;
1610   result->next = NULL;
1611   result->cur = result->base;
1612   return result;
1613 }
1614
1615 /* Creates a new buffer with enough space to hold the uncommitted
1616    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
1617    the excess bytes to the new buffer.  Chains the new buffer after
1618    BUFF, and returns the new buffer.  */
1619 _cpp_buff *
1620 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
1621 {
1622   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
1623   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
1624
1625   buff->next = new_buff;
1626   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
1627   return new_buff;
1628 }
1629
1630 /* Creates a new buffer with enough space to hold the uncommitted
1631    remaining bytes of the buffer pointed to by BUFF, and at least
1632    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
1633    Chains the new buffer before the buffer pointed to by BUFF, and
1634    updates the pointer to point to the new buffer.  */
1635 void
1636 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
1637 {
1638   _cpp_buff *new_buff, *old_buff = *pbuff;
1639   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
1640
1641   new_buff = _cpp_get_buff (pfile, size);
1642   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
1643   new_buff->next = old_buff;
1644   *pbuff = new_buff;
1645 }
1646
1647 /* Free a chain of buffers starting at BUFF.  */
1648 void
1649 _cpp_free_buff (_cpp_buff *buff)
1650 {
1651   _cpp_buff *next;
1652
1653   for (; buff; buff = next)
1654     {
1655       next = buff->next;
1656       free (buff->base);
1657     }
1658 }
1659
1660 /* Allocate permanent, unaligned storage of length LEN.  */
1661 unsigned char *
1662 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
1663 {
1664   _cpp_buff *buff = pfile->u_buff;
1665   unsigned char *result = buff->cur;
1666
1667   if (len > (size_t) (buff->limit - result))
1668     {
1669       buff = _cpp_get_buff (pfile, len);
1670       buff->next = pfile->u_buff;
1671       pfile->u_buff = buff;
1672       result = buff->cur;
1673     }
1674
1675   buff->cur = result + len;
1676   return result;
1677 }
1678
1679 /* Allocate permanent, unaligned storage of length LEN from a_buff.
1680    That buffer is used for growing allocations when saving macro
1681    replacement lists in a #define, and when parsing an answer to an
1682    assertion in #assert, #unassert or #if (and therefore possibly
1683    whilst expanding macros).  It therefore must not be used by any
1684    code that they might call: specifically the lexer and the guts of
1685    the macro expander.
1686
1687    All existing other uses clearly fit this restriction: storing
1688    registered pragmas during initialization.  */
1689 unsigned char *
1690 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
1691 {
1692   _cpp_buff *buff = pfile->a_buff;
1693   unsigned char *result = buff->cur;
1694
1695   if (len > (size_t) (buff->limit - result))
1696     {
1697       buff = _cpp_get_buff (pfile, len);
1698       buff->next = pfile->a_buff;
1699       pfile->a_buff = buff;
1700       result = buff->cur;
1701     }
1702
1703   buff->cur = result + len;
1704   return result;
1705 }
1706
1707 /* Say which field of TOK is in use.  */
1708
1709 enum cpp_token_fld_kind
1710 cpp_token_val_index (cpp_token *tok)
1711 {
1712   switch (TOKEN_SPELL (tok))
1713     {
1714     case SPELL_IDENT:
1715       return CPP_TOKEN_FLD_NODE;
1716     case SPELL_LITERAL:
1717       return CPP_TOKEN_FLD_STR;
1718     case SPELL_NONE:
1719       if (tok->type == CPP_MACRO_ARG)
1720         return CPP_TOKEN_FLD_ARG_NO;
1721       else if (tok->type == CPP_PADDING)
1722         return CPP_TOKEN_FLD_SOURCE;
1723       else if (tok->type == CPP_PRAGMA)
1724         return CPP_TOKEN_FLD_PRAGMA;
1725       /* else fall through */
1726     default:
1727       return CPP_TOKEN_FLD_NONE;
1728     }
1729 }