contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the NumericLiteralParser, CharLiteralParser, and
  11 // StringLiteralParser interfaces.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang/Lex/LiteralSupport.h"
  16 #include "clang/Lex/Preprocessor.h"
  17 #include "clang/Lex/LexDiagnostic.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "llvm/ADT/StringRef.h"
  20 #include "llvm/ADT/StringExtras.h"
  21 using namespace clang;
  22
  23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
  24 /// not valid.
  25 static int HexDigitValue(char C) {
  26   if (C >= '0' && C <= '9') return C-'0';
  27   if (C >= 'a' && C <= 'f') return C-'a'+10;
  28   if (C >= 'A' && C <= 'F') return C-'A'+10;
  29   return -1;
  30 }
  31
  32 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  33 /// either a character or a string literal.
  34 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  35                                   const char *ThisTokEnd, bool &HadError,
  36                                   FullSourceLoc Loc, bool IsWide,
  37                                   Diagnostic *Diags, const TargetInfo &Target) {
  38   // Skip the '\' char.
  39   ++ThisTokBuf;
  40
  41   // We know that this character can't be off the end of the buffer, because
  42   // that would have been \", which would not have been the end of string.
  43   unsigned ResultChar = *ThisTokBuf++;
  44   switch (ResultChar) {
  45   // These map to themselves.
  46   case '\\': case '\'': case '"': case '?': break;
  47
  48     // These have fixed mappings.
  49   case 'a':
  50     // TODO: K&R: the meaning of '\\a' is different in traditional C
  51     ResultChar = 7;
  52     break;
  53   case 'b':
  54     ResultChar = 8;
  55     break;
  56   case 'e':
  57     if (Diags)
  58       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
  59     ResultChar = 27;
  60     break;
  61   case 'E':
  62     if (Diags)
  63       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
  64     ResultChar = 27;
  65     break;
  66   case 'f':
  67     ResultChar = 12;
  68     break;
  69   case 'n':
  70     ResultChar = 10;
  71     break;
  72   case 'r':
  73     ResultChar = 13;
  74     break;
  75   case 't':
  76     ResultChar = 9;
  77     break;
  78   case 'v':
  79     ResultChar = 11;
  80     break;
  81   case 'x': { // Hex escape.
  82     ResultChar = 0;
  83     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
  84       if (Diags)
  85         Diags->Report(Loc, diag::err_hex_escape_no_digits);
  86       HadError = 1;
  87       break;
  88     }
  89
  90     // Hex escapes are a maximal series of hex digits.
  91     bool Overflow = false;
  92     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  93       int CharVal = HexDigitValue(ThisTokBuf[0]);
  94       if (CharVal == -1) break;
  95       // About to shift out a digit?
  96       Overflow |= (ResultChar & 0xF0000000) ? true : false;
  97       ResultChar <<= 4;
  98       ResultChar |= CharVal;
  99     }
 100
 101     // See if any bits will be truncated when evaluated as a character.
 102     unsigned CharWidth =
 103       IsWide ? Target.getWCharWidth() : Target.getCharWidth();
 104
 105     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 106       Overflow = true;
 107       ResultChar &= ~0U >> (32-CharWidth);
 108     }
 109
 110     // Check for overflow.
 111     if (Overflow && Diags)   // Too many digits to fit in
 112       Diags->Report(Loc, diag::warn_hex_escape_too_large);
 113     break;
 114   }
 115   case '0': case '1': case '2': case '3':
 116   case '4': case '5': case '6': case '7': {
 117     // Octal escapes.
 118     --ThisTokBuf;
 119     ResultChar = 0;
 120
 121     // Octal escapes are a series of octal digits with maximum length 3.
 122     // "\0123" is a two digit sequence equal to "\012" "3".
 123     unsigned NumDigits = 0;
 124     do {
 125       ResultChar <<= 3;
 126       ResultChar |= *ThisTokBuf++ - '0';
 127       ++NumDigits;
 128     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 129              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 130
 131     // Check for overflow.  Reject '\777', but not L'\777'.
 132     unsigned CharWidth =
 133       IsWide ? Target.getWCharWidth() : Target.getCharWidth();
 134
 135     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 136       if (Diags)
 137         Diags->Report(Loc, diag::warn_octal_escape_too_large);
 138       ResultChar &= ~0U >> (32-CharWidth);
 139     }
 140     break;
 141   }
 142
 143     // Otherwise, these are not valid escapes.
 144   case '(': case '{': case '[': case '%':
 145     // GCC accepts these as extensions.  We warn about them as such though.
 146     if (Diags)
 147       Diags->Report(Loc, diag::ext_nonstandard_escape)
 148         << std::string()+(char)ResultChar;
 149     break;
 150   default:
 151     if (Diags == 0)
 152       break;
 153
 154     if (isgraph(ResultChar))
 155       Diags->Report(Loc, diag::ext_unknown_escape)
 156         << std::string()+(char)ResultChar;
 157     else
 158       Diags->Report(Loc, diag::ext_unknown_escape)
 159         << "x"+llvm::utohexstr(ResultChar);
 160     break;
 161   }
 162
 163   return ResultChar;
 164 }
 165
 166 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 167 /// return the UTF32.
 168 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 169                              uint32_t &UcnVal, unsigned short &UcnLen,
 170                              FullSourceLoc Loc, Diagnostic *Diags,
 171                              const LangOptions &Features) {
 172   if (!Features.CPlusPlus && !Features.C99 && Diags)
 173     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
 174
 175   // Save the beginning of the string (for error diagnostics).
 176   const char *ThisTokBegin = ThisTokBuf;
 177
 178   // Skip the '\u' char's.
 179   ThisTokBuf += 2;
 180
 181   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
 182     if (Diags)
 183       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
 184     return false;
 185   }
 186   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 187   unsigned short UcnLenSave = UcnLen;
 188   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
 189     int CharVal = HexDigitValue(ThisTokBuf[0]);
 190     if (CharVal == -1) break;
 191     UcnVal <<= 4;
 192     UcnVal |= CharVal;
 193   }
 194   // If we didn't consume the proper number of digits, there is a problem.
 195   if (UcnLenSave) {
 196     if (Diags) {
 197       SourceLocation L =
 198         Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
 199                                        Loc.getManager(), Features);
 200       Diags->Report(FullSourceLoc(L, Loc.getManager()),
 201                     diag::err_ucn_escape_incomplete);
 202     }
 203     return false;
 204   }
 205   // Check UCN constraints (C99 6.4.3p2).
 206   if ((UcnVal < 0xa0 &&
 207       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
 208       || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
 209       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
 210     if (Diags)
 211       Diags->Report(Loc, diag::err_ucn_escape_invalid);
 212     return false;
 213   }
 214   return true;
 215 }
 216
 217 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
 218 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
 219 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
 220 /// we will likely rework our support for UCN's.
 221 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 222                             char *&ResultBuf, bool &HadError,
 223                             FullSourceLoc Loc, bool wide, Diagnostic *Diags,
 224                             const LangOptions &Features) {
 225   typedef uint32_t UTF32;
 226   UTF32 UcnVal = 0;
 227   unsigned short UcnLen = 0;
 228   if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
 229                         Features)) {
 230     HadError = 1;
 231     return;
 232   }
 233
 234   if (wide) {
 235     (void)UcnLen;
 236     assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 237
 238     if (!Features.ShortWChar) {
 239       // Note: our internal rep of wide char tokens is always little-endian.
 240       *ResultBuf++ = (UcnVal & 0x000000FF);
 241       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 242       *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
 243       *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
 244       return;
 245     }
 246
 247     // Convert to UTF16.
 248     if (UcnVal < (UTF32)0xFFFF) {
 249       *ResultBuf++ = (UcnVal & 0x000000FF);
 250       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 251       return;
 252     }
 253     if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
 254
 255     typedef uint16_t UTF16;
 256     UcnVal -= 0x10000;
 257     UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
 258     UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
 259     *ResultBuf++ = (surrogate1 & 0x000000FF);
 260     *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
 261     *ResultBuf++ = (surrogate2 & 0x000000FF);
 262     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
 263     return;
 264   }
 265   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 266   // The conversion below was inspired by:
 267   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 268   // First, we determine how many bytes the result will require.
 269   typedef uint8_t UTF8;
 270
 271   unsigned short bytesToWrite = 0;
 272   if (UcnVal < (UTF32)0x80)
 273     bytesToWrite = 1;
 274   else if (UcnVal < (UTF32)0x800)
 275     bytesToWrite = 2;
 276   else if (UcnVal < (UTF32)0x10000)
 277     bytesToWrite = 3;
 278   else
 279     bytesToWrite = 4;
 280
 281   const unsigned byteMask = 0xBF;
 282   const unsigned byteMark = 0x80;
 283
 284   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 285   // into the first byte, depending on how many bytes follow.
 286   static const UTF8 firstByteMark[5] = {
 287     0x00, 0x00, 0xC0, 0xE0, 0xF0
 288   };
 289   // Finally, we write the bytes into ResultBuf.
 290   ResultBuf += bytesToWrite;
 291   switch (bytesToWrite) { // note: everything falls through.
 292     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 293     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 294     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 295     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 296   }
 297   // Update the buffer.
 298   ResultBuf += bytesToWrite;
 299 }
 300
 301
 302 ///       integer-constant: [C99 6.4.4.1]
 303 ///         decimal-constant integer-suffix
 304 ///         octal-constant integer-suffix
 305 ///         hexadecimal-constant integer-suffix
 306 ///       decimal-constant:
 307 ///         nonzero-digit
 308 ///         decimal-constant digit
 309 ///       octal-constant:
 310 ///         0
 311 ///         octal-constant octal-digit
 312 ///       hexadecimal-constant:
 313 ///         hexadecimal-prefix hexadecimal-digit
 314 ///         hexadecimal-constant hexadecimal-digit
 315 ///       hexadecimal-prefix: one of
 316 ///         0x 0X
 317 ///       integer-suffix:
 318 ///         unsigned-suffix [long-suffix]
 319 ///         unsigned-suffix [long-long-suffix]
 320 ///         long-suffix [unsigned-suffix]
  321 ///         long-long-suffix [unsigned-suffix]
 322 ///       nonzero-digit:
 323 ///         1 2 3 4 5 6 7 8 9
 324 ///       octal-digit:
 325 ///         0 1 2 3 4 5 6 7
 326 ///       hexadecimal-digit:
 327 ///         0 1 2 3 4 5 6 7 8 9
 328 ///         a b c d e f
 329 ///         A B C D E F
 330 ///       unsigned-suffix: one of
 331 ///         u U
 332 ///       long-suffix: one of
 333 ///         l L
 334 ///       long-long-suffix: one of
 335 ///         ll LL
 336 ///
 337 ///       floating-constant: [C99 6.4.4.2]
 338 ///         TODO: add rules...
 339 ///
 340 NumericLiteralParser::
 341 NumericLiteralParser(const char *begin, const char *end,
 342                      SourceLocation TokLoc, Preprocessor &pp)
 343   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
 344
 345   // This routine assumes that the range begin/end matches the regex for integer
 346   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 347   // the byte at "*end" is both valid and not part of the regex.  Because of
 348   // this, it doesn't have to check for 'overscan' in various places.
 349   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
 350          "Lexer didn't maximally munch?");
 351
 352   s = DigitsBegin = begin;
 353   saw_exponent = false;
 354   saw_period = false;
 355   isLong = false;
 356   isUnsigned = false;
 357   isLongLong = false;
 358   isFloat = false;
 359   isImaginary = false;
 360   isMicrosoftInteger = false;
 361   hadError = false;
 362
 363   if (*s == '0') { // parse radix
 364     ParseNumberStartingWithZero(TokLoc);
 365     if (hadError)
 366       return;
 367   } else { // the first digit is non-zero
 368     radix = 10;
 369     s = SkipDigits(s);
 370     if (s == ThisTokEnd) {
 371       // Done.
 372     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
 373       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 374               diag::err_invalid_decimal_digit) << llvm::StringRef(s, 1);
 375       hadError = true;
 376       return;
 377     } else if (*s == '.') {
 378       s++;
 379       saw_period = true;
 380       s = SkipDigits(s);
 381     }
 382     if ((*s == 'e' || *s == 'E')) { // exponent
 383       const char *Exponent = s;
 384       s++;
 385       saw_exponent = true;
 386       if (*s == '+' || *s == '-')  s++; // sign
 387       const char *first_non_digit = SkipDigits(s);
 388       if (first_non_digit != s) {
 389         s = first_non_digit;
 390       } else {
 391         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
 392                 diag::err_exponent_has_no_digits);
 393         hadError = true;
 394         return;
 395       }
 396     }
 397   }
 398
 399   SuffixBegin = s;
 400
 401   // Parse the suffix.  At this point we can classify whether we have an FP or
 402   // integer constant.
 403   bool isFPConstant = isFloatingLiteral();
 404
 405   // Loop over all of the characters of the suffix.  If we see something bad,
 406   // we break out of the loop.
 407   for (; s != ThisTokEnd; ++s) {
 408     switch (*s) {
 409     case 'f':      // FP Suffix for "float"
 410     case 'F':
 411       if (!isFPConstant) break;  // Error for integer constant.
 412       if (isFloat || isLong) break; // FF, LF invalid.
 413       isFloat = true;
 414       continue;  // Success.
 415     case 'u':
 416     case 'U':
 417       if (isFPConstant) break;  // Error for floating constant.
 418       if (isUnsigned) break;    // Cannot be repeated.
 419       isUnsigned = true;
 420       continue;  // Success.
 421     case 'l':
 422     case 'L':
 423       if (isLong || isLongLong) break;  // Cannot be repeated.
 424       if (isFloat) break;               // LF invalid.
 425
 426       // Check for long long.  The L's need to be adjacent and the same case.
 427       if (s+1 != ThisTokEnd && s[1] == s[0]) {
 428         if (isFPConstant) break;        // long long invalid for floats.
 429         isLongLong = true;
 430         ++s;  // Eat both of them.
 431       } else {
 432         isLong = true;
 433       }
 434       continue;  // Success.
 435     case 'i':
 436     case 'I':
 437       if (PP.getLangOptions().Microsoft) {
 438         if (isFPConstant || isLong || isLongLong) break;
 439
 440         // Allow i8, i16, i32, i64, and i128.
 441         if (s + 1 != ThisTokEnd) {
 442           switch (s[1]) {
 443             case '8':
 444               s += 2; // i8 suffix
 445               isMicrosoftInteger = true;
 446               break;
 447             case '1':
 448               if (s + 2 == ThisTokEnd) break;
 449               if (s[2] == '6') {
 450                 s += 3; // i16 suffix
 451                 isMicrosoftInteger = true;
 452               }
 453               else if (s[2] == '2') {
 454                 if (s + 3 == ThisTokEnd) break;
 455                 if (s[3] == '8') {
 456                   s += 4; // i128 suffix
 457                   isMicrosoftInteger = true;
 458                 }
 459               }
 460               break;
 461             case '3':
 462               if (s + 2 == ThisTokEnd) break;
 463               if (s[2] == '2') {
 464                 s += 3; // i32 suffix
 465                 isLong = true;
 466                 isMicrosoftInteger = true;
 467               }
 468               break;
 469             case '6':
 470               if (s + 2 == ThisTokEnd) break;
 471               if (s[2] == '4') {
 472                 s += 3; // i64 suffix
 473                 isLongLong = true;
 474                 isMicrosoftInteger = true;
 475               }
 476               break;
 477             default:
 478               break;
 479           }
 480           break;
 481         }
 482       }
 483       // fall through.
 484     case 'j':
 485     case 'J':
 486       if (isImaginary) break;   // Cannot be repeated.
 487       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 488               diag::ext_imaginary_constant);
 489       isImaginary = true;
 490       continue;  // Success.
 491     }
 492     // If we reached here, there was an error.
 493     break;
 494   }
 495
 496   // Report an error if there are any.
 497   if (s != ThisTokEnd) {
 498     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 499             isFPConstant ? diag::err_invalid_suffix_float_constant :
 500                            diag::err_invalid_suffix_integer_constant)
 501       << llvm::StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
 502     hadError = true;
 503     return;
 504   }
 505 }
 506
 507 /// ParseNumberStartingWithZero - This method is called when the first character
 508 /// of the number is found to be a zero.  This means it is either an octal
 509 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
 510 /// a floating point number (01239.123e4).  Eat the prefix, determining the
 511 /// radix etc.
 512 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
 513   assert(s[0] == '0' && "Invalid method call");
 514   s++;
 515
 516   // Handle a hex number like 0x1234.
 517   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
 518     s++;
 519     radix = 16;
 520     DigitsBegin = s;
 521     s = SkipHexDigits(s);
 522     if (s == ThisTokEnd) {
 523       // Done.
 524     } else if (*s == '.') {
 525       s++;
 526       saw_period = true;
 527       s = SkipHexDigits(s);
 528     }
 529     // A binary exponent can appear with or with a '.'. If dotted, the
 530     // binary exponent is required.
 531     if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) {
 532       const char *Exponent = s;
 533       s++;
 534       saw_exponent = true;
 535       if (*s == '+' || *s == '-')  s++; // sign
 536       const char *first_non_digit = SkipDigits(s);
 537       if (first_non_digit == s) {
 538         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 539                 diag::err_exponent_has_no_digits);
 540         hadError = true;
 541         return;
 542       }
 543       s = first_non_digit;
 544
 545       // In C++0x, we cannot support hexadecmial floating literals because
 546       // they conflict with user-defined literals, so we warn in previous
 547       // versions of C++ by default.
 548       if (PP.getLangOptions().CPlusPlus)
 549         PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus);
 550       else if (!PP.getLangOptions().HexFloats)
 551         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
 552     } else if (saw_period) {
 553       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 554               diag::err_hexconstant_requires_exponent);
 555       hadError = true;
 556     }
 557     return;
 558   }
 559
 560   // Handle simple binary numbers 0b01010
 561   if (*s == 'b' || *s == 'B') {
 562     // 0b101010 is a GCC extension.
 563     PP.Diag(TokLoc, diag::ext_binary_literal);
 564     ++s;
 565     radix = 2;
 566     DigitsBegin = s;
 567     s = SkipBinaryDigits(s);
 568     if (s == ThisTokEnd) {
 569       // Done.
 570     } else if (isxdigit(*s)) {
 571       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 572               diag::err_invalid_binary_digit) << llvm::StringRef(s, 1);
 573       hadError = true;
 574     }
 575     // Other suffixes will be diagnosed by the caller.
 576     return;
 577   }
 578
 579   // For now, the radix is set to 8. If we discover that we have a
 580   // floating point constant, the radix will change to 10. Octal floating
 581   // point constants are not permitted (only decimal and hexadecimal).
 582   radix = 8;
 583   DigitsBegin = s;
 584   s = SkipOctalDigits(s);
 585   if (s == ThisTokEnd)
 586     return; // Done, simple octal number like 01234
 587
 588   // If we have some other non-octal digit that *is* a decimal digit, see if
 589   // this is part of a floating point number like 094.123 or 09e1.
 590   if (isdigit(*s)) {
 591     const char *EndDecimal = SkipDigits(s);
 592     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
 593       s = EndDecimal;
 594       radix = 10;
 595     }
 596   }
 597
 598   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
 599   // the code is using an incorrect base.
 600   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
 601     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 602             diag::err_invalid_octal_digit) << llvm::StringRef(s, 1);
 603     hadError = true;
 604     return;
 605   }
 606
 607   if (*s == '.') {
 608     s++;
 609     radix = 10;
 610     saw_period = true;
 611     s = SkipDigits(s); // Skip suffix.
 612   }
 613   if (*s == 'e' || *s == 'E') { // exponent
 614     const char *Exponent = s;
 615     s++;
 616     radix = 10;
 617     saw_exponent = true;
 618     if (*s == '+' || *s == '-')  s++; // sign
 619     const char *first_non_digit = SkipDigits(s);
 620     if (first_non_digit != s) {
 621       s = first_non_digit;
 622     } else {
 623       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 624               diag::err_exponent_has_no_digits);
 625       hadError = true;
 626       return;
 627     }
 628   }
 629 }
 630
 631
 632 /// GetIntegerValue - Convert this numeric literal value to an APInt that
 633 /// matches Val's input width.  If there is an overflow, set Val to the low bits
 634 /// of the result and return true.  Otherwise, return false.
 635 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
 636   // Fast path: Compute a conservative bound on the maximum number of
 637   // bits per digit in this radix. If we can't possibly overflow a
 638   // uint64 based on that bound then do the simple conversion to
 639   // integer. This avoids the expensive overflow checking below, and
 640   // handles the common cases that matter (small decimal integers and
 641   // hex/octal values which don't overflow).
 642   unsigned MaxBitsPerDigit = 1;
 643   while ((1U << MaxBitsPerDigit) < radix)
 644     MaxBitsPerDigit += 1;
 645   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
 646     uint64_t N = 0;
 647     for (s = DigitsBegin; s != SuffixBegin; ++s)
 648       N = N*radix + HexDigitValue(*s);
 649
 650     // This will truncate the value to Val's input width. Simply check
 651     // for overflow by comparing.
 652     Val = N;
 653     return Val.getZExtValue() != N;
 654   }
 655
 656   Val = 0;
 657   s = DigitsBegin;
 658
 659   llvm::APInt RadixVal(Val.getBitWidth(), radix);
 660   llvm::APInt CharVal(Val.getBitWidth(), 0);
 661   llvm::APInt OldVal = Val;
 662
 663   bool OverflowOccurred = false;
 664   while (s < SuffixBegin) {
 665     unsigned C = HexDigitValue(*s++);
 666
 667     // If this letter is out of bound for this radix, reject it.
 668     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
 669
 670     CharVal = C;
 671
 672     // Add the digit to the value in the appropriate radix.  If adding in digits
 673     // made the value smaller, then this overflowed.
 674     OldVal = Val;
 675
 676     // Multiply by radix, did overflow occur on the multiply?
 677     Val *= RadixVal;
 678     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
 679
 680     // Add value, did overflow occur on the value?
 681     //   (a + b) ult b  <=> overflow
 682     Val += CharVal;
 683     OverflowOccurred |= Val.ult(CharVal);
 684   }
 685   return OverflowOccurred;
 686 }
 687
 688 llvm::APFloat::opStatus
 689 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 690   using llvm::APFloat;
 691   using llvm::StringRef;
 692
 693   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
 694   return Result.convertFromString(StringRef(ThisTokBegin, n),
 695                                   APFloat::rmNearestTiesToEven);
 696 }
 697
 698
 699 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 700                                      SourceLocation Loc, Preprocessor &PP) {
 701   // At this point we know that the character matches the regex "L?'.*'".
 702   HadError = false;
 703
 704   // Determine if this is a wide character.
 705   IsWide = begin[0] == 'L';
 706   if (IsWide) ++begin;
 707
 708   // Skip over the entry quote.
 709   assert(begin[0] == '\'' && "Invalid token lexed");
 710   ++begin;
 711
 712   // FIXME: The "Value" is an uint64_t so we can handle char literals of
 713   // up to 64-bits.
 714   // FIXME: This extensively assumes that 'char' is 8-bits.
 715   assert(PP.getTargetInfo().getCharWidth() == 8 &&
 716          "Assumes char is 8 bits");
 717   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
 718          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
 719          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
 720   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
 721          "Assumes sizeof(wchar) on target is <= 64");
 722
 723   // This is what we will use for overflow detection
 724   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
 725
 726   unsigned NumCharsSoFar = 0;
 727   bool Warned = false;
 728   while (begin[0] != '\'') {
 729     uint64_t ResultChar;
 730
 731       // Is this a Universal Character Name escape?
 732     if (begin[0] != '\\')     // If this is a normal character, consume it.
 733       ResultChar = *begin++;
 734     else {                    // Otherwise, this is an escape character.
 735       // Check for UCN.
 736       if (begin[1] == 'u' || begin[1] == 'U') {
 737         uint32_t utf32 = 0;
 738         unsigned short UcnLen = 0;
 739         if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
 740                               FullSourceLoc(Loc, PP.getSourceManager()),
 741                               &PP.getDiagnostics(), PP.getLangOptions())) {
 742           HadError = 1;
 743         }
 744         ResultChar = utf32;
 745       } else {
 746         // Otherwise, this is a non-UCN escape character.  Process it.
 747         ResultChar = ProcessCharEscape(begin, end, HadError,
 748                                        FullSourceLoc(Loc,PP.getSourceManager()),
 749                                        IsWide,
 750                                        &PP.getDiagnostics(), PP.getTargetInfo());
 751       }
 752     }
 753
 754     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
 755     // implementation defined (C99 6.4.4.4p10).
 756     if (NumCharsSoFar) {
 757       if (IsWide) {
 758         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
 759         LitVal = 0;
 760       } else {
 761         // Narrow character literals act as though their value is concatenated
 762         // in this implementation, but warn on overflow.
 763         if (LitVal.countLeadingZeros() < 8 && !Warned) {
 764           PP.Diag(Loc, diag::warn_char_constant_too_large);
 765           Warned = true;
 766         }
 767         LitVal <<= 8;
 768       }
 769     }
 770
 771     LitVal = LitVal + ResultChar;
 772     ++NumCharsSoFar;
 773   }
 774
 775   // If this is the second character being processed, do special handling.
 776   if (NumCharsSoFar > 1) {
 777     // Warn about discarding the top bits for multi-char wide-character
 778     // constants (L'abcd').
 779     if (IsWide)
 780       PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
 781     else if (NumCharsSoFar != 4)
 782       PP.Diag(Loc, diag::ext_multichar_character_literal);
 783     else
 784       PP.Diag(Loc, diag::ext_four_char_character_literal);
 785     IsMultiChar = true;
 786   } else
 787     IsMultiChar = false;
 788
 789   // Transfer the value from APInt to uint64_t
 790   Value = LitVal.getZExtValue();
 791
 792   if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
 793     PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 794
 795   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
 796   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
 797   // character constants are not sign extended in the this implementation:
 798   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
 799   if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
 800       PP.getLangOptions().CharIsSigned)
 801     Value = (signed char)Value;
 802 }
 803
 804
 805 ///       string-literal: [C99 6.4.5]
 806 ///          " [s-char-sequence] "
 807 ///         L" [s-char-sequence] "
 808 ///       s-char-sequence:
 809 ///         s-char
 810 ///         s-char-sequence s-char
 811 ///       s-char:
 812 ///         any source character except the double quote ",
 813 ///           backslash \, or newline character
 814 ///         escape-character
 815 ///         universal-character-name
 816 ///       escape-character: [C99 6.4.4.4]
 817 ///         \ escape-code
 818 ///         universal-character-name
 819 ///       escape-code:
 820 ///         character-escape-code
 821 ///         octal-escape-code
 822 ///         hex-escape-code
 823 ///       character-escape-code: one of
 824 ///         n t b r f v a
 825 ///         \ ' " ?
 826 ///       octal-escape-code:
 827 ///         octal-digit
 828 ///         octal-digit octal-digit
 829 ///         octal-digit octal-digit octal-digit
 830 ///       hex-escape-code:
 831 ///         x hex-digit
 832 ///         hex-escape-code hex-digit
 833 ///       universal-character-name:
 834 ///         \u hex-quad
 835 ///         \U hex-quad hex-quad
 836 ///       hex-quad:
 837 ///         hex-digit hex-digit hex-digit hex-digit
 838 ///
 839 StringLiteralParser::
 840 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
 841                     Preprocessor &PP, bool Complain)
 842   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
 843     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
 844     MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
 845     ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
 846   init(StringToks, NumStringToks);
 847 }
 848
 849 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 850   // The literal token may have come from an invalid source location (e.g. due
 851   // to a PCH error), in which case the token length will be 0.
 852   if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
 853     hadError = true;
 854     return;
 855   }
 856
 857   // Scan all of the string portions, remember the max individual token length,
 858   // computing a bound on the concatenated string length, and see whether any
 859   // piece is a wide-string.  If any of the string portions is a wide-string
 860   // literal, the result is a wide-string literal [C99 6.4.5p4].
 861   assert(NumStringToks && "expected at least one token");
 862   MaxTokenLength = StringToks[0].getLength();
 863   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
 864   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
 865   AnyWide = StringToks[0].is(tok::wide_string_literal);
 866
 867   hadError = false;
 868
 869   // Implement Translation Phase #6: concatenation of string literals
 870   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
 871   for (unsigned i = 1; i != NumStringToks; ++i) {
 872     if (StringToks[i].getLength() < 2) {
 873       hadError = true;
 874       return;
 875     }
 876
 877     // The string could be shorter than this if it needs cleaning, but this is a
 878     // reasonable bound, which is all we need.
 879     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
 880     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
 881
 882     // Remember maximum string piece length.
 883     if (StringToks[i].getLength() > MaxTokenLength)
 884       MaxTokenLength = StringToks[i].getLength();
 885
 886     // Remember if we see any wide strings.
 887     AnyWide |= StringToks[i].is(tok::wide_string_literal);
 888   }
 889
 890   // Include space for the null terminator.
 891   ++SizeBound;
 892
 893   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 894
 895   // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
 896   // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
 897   wchar_tByteWidth = ~0U;
 898   if (AnyWide) {
 899     wchar_tByteWidth = Target.getWCharWidth();
 900     assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
 901     wchar_tByteWidth /= 8;
 902   }
 903
 904   // The output buffer size needs to be large enough to hold wide characters.
 905   // This is a worst-case assumption which basically corresponds to L"" "long".
 906   if (AnyWide)
 907     SizeBound *= wchar_tByteWidth;
 908
 909   // Size the temporary buffer to hold the result string data.
 910   ResultBuf.resize(SizeBound);
 911
 912   // Likewise, but for each string piece.
 913   llvm::SmallString<512> TokenBuf;
 914   TokenBuf.resize(MaxTokenLength);
 915
 916   // Loop over all the strings, getting their spelling, and expanding them to
 917   // wide strings as appropriate.
 918   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
 919
 920   Pascal = false;
 921
 922   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
 923     const char *ThisTokBuf = &TokenBuf[0];
 924     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
 925     // that ThisTokBuf points to a buffer that is big enough for the whole token
 926     // and 'spelled' tokens can only shrink.
 927     bool StringInvalid = false;
 928     unsigned ThisTokLen =
 929       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
 930                          &StringInvalid);
 931     if (StringInvalid) {
 932       hadError = 1;
 933       continue;
 934     }
 935
 936     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
 937     bool wide = false;
 938     // TODO: Input character set mapping support.
 939
 940     // Skip L marker for wide strings.
 941     if (ThisTokBuf[0] == 'L') {
 942       wide = true;
 943       ++ThisTokBuf;
 944     }
 945
 946     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
 947     ++ThisTokBuf;
 948
 949     // Check if this is a pascal string
 950     if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
 951         ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
 952
 953       // If the \p sequence is found in the first token, we have a pascal string
 954       // Otherwise, if we already have a pascal string, ignore the first \p
 955       if (i == 0) {
 956         ++ThisTokBuf;
 957         Pascal = true;
 958       } else if (Pascal)
 959         ThisTokBuf += 2;
 960     }
 961
 962     while (ThisTokBuf != ThisTokEnd) {
 963       // Is this a span of non-escape characters?
 964       if (ThisTokBuf[0] != '\\') {
 965         const char *InStart = ThisTokBuf;
 966         do {
 967           ++ThisTokBuf;
 968         } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 969
 970         // Copy the character span over.
 971         unsigned Len = ThisTokBuf-InStart;
 972         if (!AnyWide) {
 973           memcpy(ResultPtr, InStart, Len);
 974           ResultPtr += Len;
 975         } else {
 976           // Note: our internal rep of wide char tokens is always little-endian.
 977           for (; Len; --Len, ++InStart) {
 978             *ResultPtr++ = InStart[0];
 979             // Add zeros at the end.
 980             for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
 981               *ResultPtr++ = 0;
 982           }
 983         }
 984         continue;
 985       }
 986       // Is this a Universal Character Name escape?
 987       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
 988         EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
 989                         hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
 990                         wide, Diags, Features);
 991         continue;
 992       }
 993       // Otherwise, this is a non-UCN escape character.  Process it.
 994       unsigned ResultChar =
 995         ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
 996                           FullSourceLoc(StringToks[i].getLocation(), SM),
 997                           AnyWide, Diags, Target);
 998
 999       // Note: our internal rep of wide char tokens is always little-endian.
1000       *ResultPtr++ = ResultChar & 0xFF;
1001
1002       if (AnyWide) {
1003         for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
1004           *ResultPtr++ = ResultChar >> i*8;
1005       }
1006     }
1007   }
1008
1009   if (Pascal) {
1010     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
1011     if (AnyWide)
1012       ResultBuf[0] /= wchar_tByteWidth;
1013
1014     // Verify that pascal strings aren't too large.
1015     if (GetStringLength() > 256) {
1016       if (Diags)
1017         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1018                       diag::err_pascal_string_too_long)
1019           << SourceRange(StringToks[0].getLocation(),
1020                          StringToks[NumStringToks-1].getLocation());
1021       hadError = 1;
1022       return;
1023     }
1024   } else if (Diags) {
1025     // Complain if this string literal has too many characters.
1026     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1027
1028     if (GetNumStringChars() > MaxChars)
1029       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1030                     diag::ext_string_too_long)
1031         << GetNumStringChars() << MaxChars
1032         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1033         << SourceRange(StringToks[0].getLocation(),
1034                        StringToks[NumStringToks-1].getLocation());
1035   }
1036 }
1037
1038
1039 /// getOffsetOfStringByte - This function returns the offset of the
1040 /// specified byte of the string data represented by Token.  This handles
1041 /// advancing over escape sequences in the string.
1042 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1043                                                     unsigned ByteNo) const {
1044   // Get the spelling of the token.
1045   llvm::SmallString<32> SpellingBuffer;
1046   SpellingBuffer.resize(Tok.getLength());
1047
1048   bool StringInvalid = false;
1049   const char *SpellingPtr = &SpellingBuffer[0];
1050   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1051                                        &StringInvalid);
1052   if (StringInvalid)
1053     return 0;
1054
1055   assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
1056
1057
1058   const char *SpellingStart = SpellingPtr;
1059   const char *SpellingEnd = SpellingPtr+TokLen;
1060
1061   // Skip over the leading quote.
1062   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1063   ++SpellingPtr;
1064
1065   // Skip over bytes until we find the offset we're looking for.
1066   while (ByteNo) {
1067     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1068
1069     // Step over non-escapes simply.
1070     if (*SpellingPtr != '\\') {
1071       ++SpellingPtr;
1072       --ByteNo;
1073       continue;
1074     }
1075
1076     // Otherwise, this is an escape character.  Advance over it.
1077     bool HadError = false;
1078     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
1079                       FullSourceLoc(Tok.getLocation(), SM),
1080                       false, Diags, Target);
1081     assert(!HadError && "This method isn't valid on erroneous strings");
1082     --ByteNo;
1083   }
1084
1085   return SpellingPtr-SpellingStart;
1086 }