contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the NumericLiteralParser, CharLiteralParser, and
  11 // StringLiteralParser interfaces.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang/Lex/LiteralSupport.h"
  16 #include "clang/Lex/Preprocessor.h"
  17 #include "clang/Lex/LexDiagnostic.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "llvm/ADT/StringRef.h"
  20 #include "llvm/ADT/StringExtras.h"
  21 using namespace clang;
  22
  23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
  24 /// not valid.
  25 static int HexDigitValue(char C) {
  26   if (C >= '0' && C <= '9') return C-'0';
  27   if (C >= 'a' && C <= 'f') return C-'a'+10;
  28   if (C >= 'A' && C <= 'F') return C-'A'+10;
  29   return -1;
  30 }
  31
  32 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  33 /// either a character or a string literal.
  34 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  35                                   const char *ThisTokEnd, bool &HadError,
  36                                   SourceLocation Loc, bool IsWide,
  37                                   Preprocessor &PP, bool Complain) {
  38   // Skip the '\' char.
  39   ++ThisTokBuf;
  40
  41   // We know that this character can't be off the end of the buffer, because
  42   // that would have been \", which would not have been the end of string.
  43   unsigned ResultChar = *ThisTokBuf++;
  44   switch (ResultChar) {
  45   // These map to themselves.
  46   case '\\': case '\'': case '"': case '?': break;
  47
  48     // These have fixed mappings.
  49   case 'a':
  50     // TODO: K&R: the meaning of '\\a' is different in traditional C
  51     ResultChar = 7;
  52     break;
  53   case 'b':
  54     ResultChar = 8;
  55     break;
  56   case 'e':
  57     if (Complain)
  58       PP.Diag(Loc, diag::ext_nonstandard_escape) << "e";
  59     ResultChar = 27;
  60     break;
  61   case 'E':
  62     if (Complain)
  63       PP.Diag(Loc, diag::ext_nonstandard_escape) << "E";
  64     ResultChar = 27;
  65     break;
  66   case 'f':
  67     ResultChar = 12;
  68     break;
  69   case 'n':
  70     ResultChar = 10;
  71     break;
  72   case 'r':
  73     ResultChar = 13;
  74     break;
  75   case 't':
  76     ResultChar = 9;
  77     break;
  78   case 'v':
  79     ResultChar = 11;
  80     break;
  81   case 'x': { // Hex escape.
  82     ResultChar = 0;
  83     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
  84       if (Complain)
  85         PP.Diag(Loc, diag::err_hex_escape_no_digits);
  86       HadError = 1;
  87       break;
  88     }
  89
  90     // Hex escapes are a maximal series of hex digits.
  91     bool Overflow = false;
  92     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
  93       int CharVal = HexDigitValue(ThisTokBuf[0]);
  94       if (CharVal == -1) break;
  95       // About to shift out a digit?
  96       Overflow |= (ResultChar & 0xF0000000) ? true : false;
  97       ResultChar <<= 4;
  98       ResultChar |= CharVal;
  99     }
 100
 101     // See if any bits will be truncated when evaluated as a character.
 102     unsigned CharWidth = IsWide
 103                        ? PP.getTargetInfo().getWCharWidth()
 104                        : PP.getTargetInfo().getCharWidth();
 105
 106     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 107       Overflow = true;
 108       ResultChar &= ~0U >> (32-CharWidth);
 109     }
 110
 111     // Check for overflow.
 112     if (Overflow && Complain)   // Too many digits to fit in
 113       PP.Diag(Loc, diag::warn_hex_escape_too_large);
 114     break;
 115   }
 116   case '0': case '1': case '2': case '3':
 117   case '4': case '5': case '6': case '7': {
 118     // Octal escapes.
 119     --ThisTokBuf;
 120     ResultChar = 0;
 121
 122     // Octal escapes are a series of octal digits with maximum length 3.
 123     // "\0123" is a two digit sequence equal to "\012" "3".
 124     unsigned NumDigits = 0;
 125     do {
 126       ResultChar <<= 3;
 127       ResultChar |= *ThisTokBuf++ - '0';
 128       ++NumDigits;
 129     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 130              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 131
 132     // Check for overflow.  Reject '\777', but not L'\777'.
 133     unsigned CharWidth = IsWide
 134                        ? PP.getTargetInfo().getWCharWidth()
 135                        : PP.getTargetInfo().getCharWidth();
 136
 137     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 138       if (Complain)
 139         PP.Diag(Loc, diag::warn_octal_escape_too_large);
 140       ResultChar &= ~0U >> (32-CharWidth);
 141     }
 142     break;
 143   }
 144
 145     // Otherwise, these are not valid escapes.
 146   case '(': case '{': case '[': case '%':
 147     // GCC accepts these as extensions.  We warn about them as such though.
 148     if (Complain)
 149       PP.Diag(Loc, diag::ext_nonstandard_escape)
 150         << std::string()+(char)ResultChar;
 151     break;
 152   default:
 153     if (!Complain)
 154       break;
 155
 156     if (isgraph(ThisTokBuf[0]))
 157       PP.Diag(Loc, diag::ext_unknown_escape) << std::string()+(char)ResultChar;
 158     else
 159       PP.Diag(Loc, diag::ext_unknown_escape) << "x"+llvm::utohexstr(ResultChar);
 160     break;
 161   }
 162
 163   return ResultChar;
 164 }
 165
 166 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 167 /// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
 168 /// When we decide to implement UCN's for character constants and identifiers,
 169 /// we will likely rework our support for UCN's.
 170 static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 171                              char *&ResultBuf, bool &HadError,
 172                              SourceLocation Loc, bool IsWide, Preprocessor &PP,
 173                              bool Complain)
 174 {
 175   // FIXME: Add a warning - UCN's are only valid in C++ & C99.
 176   // FIXME: Handle wide strings.
 177
 178   // Save the beginning of the string (for error diagnostics).
 179   const char *ThisTokBegin = ThisTokBuf;
 180
 181   // Skip the '\u' char's.
 182   ThisTokBuf += 2;
 183
 184   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
 185     if (Complain)
 186       PP.Diag(Loc, diag::err_ucn_escape_no_digits);
 187     HadError = 1;
 188     return;
 189   }
 190   typedef uint32_t UTF32;
 191
 192   UTF32 UcnVal = 0;
 193   unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 194   for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
 195     int CharVal = HexDigitValue(ThisTokBuf[0]);
 196     if (CharVal == -1) break;
 197     UcnVal <<= 4;
 198     UcnVal |= CharVal;
 199   }
 200   // If we didn't consume the proper number of digits, there is a problem.
 201   if (UcnLen) {
 202     if (Complain)
 203       PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
 204               diag::err_ucn_escape_incomplete);
 205     HadError = 1;
 206     return;
 207   }
 208   // Check UCN constraints (C99 6.4.3p2).
 209   if ((UcnVal < 0xa0 &&
 210       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
 211       || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
 212       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
 213     if (Complain)
 214       PP.Diag(Loc, diag::err_ucn_escape_invalid);
 215     HadError = 1;
 216     return;
 217   }
 218   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 219   // The conversion below was inspired by:
 220   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 221   // First, we determine how many bytes the result will require.
 222   typedef uint8_t UTF8;
 223
 224   unsigned short bytesToWrite = 0;
 225   if (UcnVal < (UTF32)0x80)
 226     bytesToWrite = 1;
 227   else if (UcnVal < (UTF32)0x800)
 228     bytesToWrite = 2;
 229   else if (UcnVal < (UTF32)0x10000)
 230     bytesToWrite = 3;
 231   else
 232     bytesToWrite = 4;
 233
 234   const unsigned byteMask = 0xBF;
 235   const unsigned byteMark = 0x80;
 236
 237   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 238   // into the first byte, depending on how many bytes follow.
 239   static const UTF8 firstByteMark[5] = {
 240     0x00, 0x00, 0xC0, 0xE0, 0xF0
 241   };
 242   // Finally, we write the bytes into ResultBuf.
 243   ResultBuf += bytesToWrite;
 244   switch (bytesToWrite) { // note: everything falls through.
 245     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 246     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 247     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 248     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 249   }
 250   // Update the buffer.
 251   ResultBuf += bytesToWrite;
 252 }
 253
 254
 255 ///       integer-constant: [C99 6.4.4.1]
 256 ///         decimal-constant integer-suffix
 257 ///         octal-constant integer-suffix
 258 ///         hexadecimal-constant integer-suffix
 259 ///       decimal-constant:
 260 ///         nonzero-digit
 261 ///         decimal-constant digit
 262 ///       octal-constant:
 263 ///         0
 264 ///         octal-constant octal-digit
 265 ///       hexadecimal-constant:
 266 ///         hexadecimal-prefix hexadecimal-digit
 267 ///         hexadecimal-constant hexadecimal-digit
 268 ///       hexadecimal-prefix: one of
 269 ///         0x 0X
 270 ///       integer-suffix:
 271 ///         unsigned-suffix [long-suffix]
 272 ///         unsigned-suffix [long-long-suffix]
 273 ///         long-suffix [unsigned-suffix]
 274 ///         long-long-suffix [unsigned-sufix]
 275 ///       nonzero-digit:
 276 ///         1 2 3 4 5 6 7 8 9
 277 ///       octal-digit:
 278 ///         0 1 2 3 4 5 6 7
 279 ///       hexadecimal-digit:
 280 ///         0 1 2 3 4 5 6 7 8 9
 281 ///         a b c d e f
 282 ///         A B C D E F
 283 ///       unsigned-suffix: one of
 284 ///         u U
 285 ///       long-suffix: one of
 286 ///         l L
 287 ///       long-long-suffix: one of
 288 ///         ll LL
 289 ///
 290 ///       floating-constant: [C99 6.4.4.2]
 291 ///         TODO: add rules...
 292 ///
 293 NumericLiteralParser::
 294 NumericLiteralParser(const char *begin, const char *end,
 295                      SourceLocation TokLoc, Preprocessor &pp)
 296   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
 297
 298   // This routine assumes that the range begin/end matches the regex for integer
 299   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 300   // the byte at "*end" is both valid and not part of the regex.  Because of
 301   // this, it doesn't have to check for 'overscan' in various places.
 302   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
 303          "Lexer didn't maximally munch?");
 304
 305   s = DigitsBegin = begin;
 306   saw_exponent = false;
 307   saw_period = false;
 308   isLong = false;
 309   isUnsigned = false;
 310   isLongLong = false;
 311   isFloat = false;
 312   isImaginary = false;
 313   isMicrosoftInteger = false;
 314   hadError = false;
 315
 316   if (*s == '0') { // parse radix
 317     ParseNumberStartingWithZero(TokLoc);
 318     if (hadError)
 319       return;
 320   } else { // the first digit is non-zero
 321     radix = 10;
 322     s = SkipDigits(s);
 323     if (s == ThisTokEnd) {
 324       // Done.
 325     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
 326       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 327               diag::err_invalid_decimal_digit) << std::string(s, s+1);
 328       hadError = true;
 329       return;
 330     } else if (*s == '.') {
 331       s++;
 332       saw_period = true;
 333       s = SkipDigits(s);
 334     }
 335     if ((*s == 'e' || *s == 'E')) { // exponent
 336       const char *Exponent = s;
 337       s++;
 338       saw_exponent = true;
 339       if (*s == '+' || *s == '-')  s++; // sign
 340       const char *first_non_digit = SkipDigits(s);
 341       if (first_non_digit != s) {
 342         s = first_non_digit;
 343       } else {
 344         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
 345                 diag::err_exponent_has_no_digits);
 346         hadError = true;
 347         return;
 348       }
 349     }
 350   }
 351
 352   SuffixBegin = s;
 353
 354   // Parse the suffix.  At this point we can classify whether we have an FP or
 355   // integer constant.
 356   bool isFPConstant = isFloatingLiteral();
 357
 358   // Loop over all of the characters of the suffix.  If we see something bad,
 359   // we break out of the loop.
 360   for (; s != ThisTokEnd; ++s) {
 361     switch (*s) {
 362     case 'f':      // FP Suffix for "float"
 363     case 'F':
 364       if (!isFPConstant) break;  // Error for integer constant.
 365       if (isFloat || isLong) break; // FF, LF invalid.
 366       isFloat = true;
 367       continue;  // Success.
 368     case 'u':
 369     case 'U':
 370       if (isFPConstant) break;  // Error for floating constant.
 371       if (isUnsigned) break;    // Cannot be repeated.
 372       isUnsigned = true;
 373       continue;  // Success.
 374     case 'l':
 375     case 'L':
 376       if (isLong || isLongLong) break;  // Cannot be repeated.
 377       if (isFloat) break;               // LF invalid.
 378
 379       // Check for long long.  The L's need to be adjacent and the same case.
 380       if (s+1 != ThisTokEnd && s[1] == s[0]) {
 381         if (isFPConstant) break;        // long long invalid for floats.
 382         isLongLong = true;
 383         ++s;  // Eat both of them.
 384       } else {
 385         isLong = true;
 386       }
 387       continue;  // Success.
 388     case 'i':
 389       if (PP.getLangOptions().Microsoft) {
 390         if (isFPConstant || isLong || isLongLong) break;
 391
 392         // Allow i8, i16, i32, i64, and i128.
 393         if (s + 1 != ThisTokEnd) {
 394           switch (s[1]) {
 395             case '8':
 396               s += 2; // i8 suffix
 397               isMicrosoftInteger = true;
 398               break;
 399             case '1':
 400               if (s + 2 == ThisTokEnd) break;
 401               if (s[2] == '6') s += 3; // i16 suffix
 402               else if (s[2] == '2') {
 403                 if (s + 3 == ThisTokEnd) break;
 404                 if (s[3] == '8') s += 4; // i128 suffix
 405               }
 406               isMicrosoftInteger = true;
 407               break;
 408             case '3':
 409               if (s + 2 == ThisTokEnd) break;
 410               if (s[2] == '2') s += 3; // i32 suffix
 411               isMicrosoftInteger = true;
 412               break;
 413             case '6':
 414               if (s + 2 == ThisTokEnd) break;
 415               if (s[2] == '4') s += 3; // i64 suffix
 416               isMicrosoftInteger = true;
 417               break;
 418             default:
 419               break;
 420           }
 421           break;
 422         }
 423       }
 424       // fall through.
 425     case 'I':
 426     case 'j':
 427     case 'J':
 428       if (isImaginary) break;   // Cannot be repeated.
 429       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 430               diag::ext_imaginary_constant);
 431       isImaginary = true;
 432       continue;  // Success.
 433     }
 434     // If we reached here, there was an error.
 435     break;
 436   }
 437
 438   // Report an error if there are any.
 439   if (s != ThisTokEnd) {
 440     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 441             isFPConstant ? diag::err_invalid_suffix_float_constant :
 442                            diag::err_invalid_suffix_integer_constant)
 443       << std::string(SuffixBegin, ThisTokEnd);
 444     hadError = true;
 445     return;
 446   }
 447 }
 448
 449 /// ParseNumberStartingWithZero - This method is called when the first character
 450 /// of the number is found to be a zero.  This means it is either an octal
 451 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
 452 /// a floating point number (01239.123e4).  Eat the prefix, determining the
 453 /// radix etc.
 454 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
 455   assert(s[0] == '0' && "Invalid method call");
 456   s++;
 457
 458   // Handle a hex number like 0x1234.
 459   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
 460     s++;
 461     radix = 16;
 462     DigitsBegin = s;
 463     s = SkipHexDigits(s);
 464     if (s == ThisTokEnd) {
 465       // Done.
 466     } else if (*s == '.') {
 467       s++;
 468       saw_period = true;
 469       s = SkipHexDigits(s);
 470     }
 471     // A binary exponent can appear with or with a '.'. If dotted, the
 472     // binary exponent is required.
 473     if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) {
 474       const char *Exponent = s;
 475       s++;
 476       saw_exponent = true;
 477       if (*s == '+' || *s == '-')  s++; // sign
 478       const char *first_non_digit = SkipDigits(s);
 479       if (first_non_digit == s) {
 480         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 481                 diag::err_exponent_has_no_digits);
 482         hadError = true;
 483         return;
 484       }
 485       s = first_non_digit;
 486
 487       // In C++0x, we cannot support hexadecmial floating literals because
 488       // they conflict with user-defined literals, so we warn in previous
 489       // versions of C++ by default.
 490       if (PP.getLangOptions().CPlusPlus)
 491         PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus);
 492       else if (!PP.getLangOptions().HexFloats)
 493         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
 494     } else if (saw_period) {
 495       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 496               diag::err_hexconstant_requires_exponent);
 497       hadError = true;
 498     }
 499     return;
 500   }
 501
 502   // Handle simple binary numbers 0b01010
 503   if (*s == 'b' || *s == 'B') {
 504     // 0b101010 is a GCC extension.
 505     PP.Diag(TokLoc, diag::ext_binary_literal);
 506     ++s;
 507     radix = 2;
 508     DigitsBegin = s;
 509     s = SkipBinaryDigits(s);
 510     if (s == ThisTokEnd) {
 511       // Done.
 512     } else if (isxdigit(*s)) {
 513       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 514               diag::err_invalid_binary_digit) << std::string(s, s+1);
 515       hadError = true;
 516     }
 517     // Other suffixes will be diagnosed by the caller.
 518     return;
 519   }
 520
 521   // For now, the radix is set to 8. If we discover that we have a
 522   // floating point constant, the radix will change to 10. Octal floating
 523   // point constants are not permitted (only decimal and hexadecimal).
 524   radix = 8;
 525   DigitsBegin = s;
 526   s = SkipOctalDigits(s);
 527   if (s == ThisTokEnd)
 528     return; // Done, simple octal number like 01234
 529
 530   // If we have some other non-octal digit that *is* a decimal digit, see if
 531   // this is part of a floating point number like 094.123 or 09e1.
 532   if (isdigit(*s)) {
 533     const char *EndDecimal = SkipDigits(s);
 534     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
 535       s = EndDecimal;
 536       radix = 10;
 537     }
 538   }
 539
 540   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
 541   // the code is using an incorrect base.
 542   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
 543     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 544             diag::err_invalid_octal_digit) << std::string(s, s+1);
 545     hadError = true;
 546     return;
 547   }
 548
 549   if (*s == '.') {
 550     s++;
 551     radix = 10;
 552     saw_period = true;
 553     s = SkipDigits(s); // Skip suffix.
 554   }
 555   if (*s == 'e' || *s == 'E') { // exponent
 556     const char *Exponent = s;
 557     s++;
 558     radix = 10;
 559     saw_exponent = true;
 560     if (*s == '+' || *s == '-')  s++; // sign
 561     const char *first_non_digit = SkipDigits(s);
 562     if (first_non_digit != s) {
 563       s = first_non_digit;
 564     } else {
 565       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 566               diag::err_exponent_has_no_digits);
 567       hadError = true;
 568       return;
 569     }
 570   }
 571 }
 572
 573
 574 /// GetIntegerValue - Convert this numeric literal value to an APInt that
 575 /// matches Val's input width.  If there is an overflow, set Val to the low bits
 576 /// of the result and return true.  Otherwise, return false.
 577 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
 578   // Fast path: Compute a conservative bound on the maximum number of
 579   // bits per digit in this radix. If we can't possibly overflow a
 580   // uint64 based on that bound then do the simple conversion to
 581   // integer. This avoids the expensive overflow checking below, and
 582   // handles the common cases that matter (small decimal integers and
 583   // hex/octal values which don't overflow).
 584   unsigned MaxBitsPerDigit = 1;
 585   while ((1U << MaxBitsPerDigit) < radix)
 586     MaxBitsPerDigit += 1;
 587   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
 588     uint64_t N = 0;
 589     for (s = DigitsBegin; s != SuffixBegin; ++s)
 590       N = N*radix + HexDigitValue(*s);
 591
 592     // This will truncate the value to Val's input width. Simply check
 593     // for overflow by comparing.
 594     Val = N;
 595     return Val.getZExtValue() != N;
 596   }
 597
 598   Val = 0;
 599   s = DigitsBegin;
 600
 601   llvm::APInt RadixVal(Val.getBitWidth(), radix);
 602   llvm::APInt CharVal(Val.getBitWidth(), 0);
 603   llvm::APInt OldVal = Val;
 604
 605   bool OverflowOccurred = false;
 606   while (s < SuffixBegin) {
 607     unsigned C = HexDigitValue(*s++);
 608
 609     // If this letter is out of bound for this radix, reject it.
 610     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
 611
 612     CharVal = C;
 613
 614     // Add the digit to the value in the appropriate radix.  If adding in digits
 615     // made the value smaller, then this overflowed.
 616     OldVal = Val;
 617
 618     // Multiply by radix, did overflow occur on the multiply?
 619     Val *= RadixVal;
 620     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
 621
 622     // Add value, did overflow occur on the value?
 623     //   (a + b) ult b  <=> overflow
 624     Val += CharVal;
 625     OverflowOccurred |= Val.ult(CharVal);
 626   }
 627   return OverflowOccurred;
 628 }
 629
 630 llvm::APFloat::opStatus
 631 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 632   using llvm::APFloat;
 633   using llvm::StringRef;
 634
 635   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
 636   return Result.convertFromString(StringRef(ThisTokBegin, n),
 637                                   APFloat::rmNearestTiesToEven);
 638 }
 639
 640
 641 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 642                                      SourceLocation Loc, Preprocessor &PP) {
 643   // At this point we know that the character matches the regex "L?'.*'".
 644   HadError = false;
 645
 646   // Determine if this is a wide character.
 647   IsWide = begin[0] == 'L';
 648   if (IsWide) ++begin;
 649
 650   // Skip over the entry quote.
 651   assert(begin[0] == '\'' && "Invalid token lexed");
 652   ++begin;
 653
 654   // FIXME: The "Value" is an uint64_t so we can handle char literals of
 655   // upto 64-bits.
 656   // FIXME: This extensively assumes that 'char' is 8-bits.
 657   assert(PP.getTargetInfo().getCharWidth() == 8 &&
 658          "Assumes char is 8 bits");
 659   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
 660          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
 661          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
 662   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
 663          "Assumes sizeof(wchar) on target is <= 64");
 664
 665   // This is what we will use for overflow detection
 666   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
 667
 668   unsigned NumCharsSoFar = 0;
 669   bool Warned = false;
 670   while (begin[0] != '\'') {
 671     uint64_t ResultChar;
 672     if (begin[0] != '\\')     // If this is a normal character, consume it.
 673       ResultChar = *begin++;
 674     else                      // Otherwise, this is an escape character.
 675       ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
 676                                      /*Complain=*/true);
 677
 678     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
 679     // implementation defined (C99 6.4.4.4p10).
 680     if (NumCharsSoFar) {
 681       if (IsWide) {
 682         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
 683         LitVal = 0;
 684       } else {
 685         // Narrow character literals act as though their value is concatenated
 686         // in this implementation, but warn on overflow.
 687         if (LitVal.countLeadingZeros() < 8 && !Warned) {
 688           PP.Diag(Loc, diag::warn_char_constant_too_large);
 689           Warned = true;
 690         }
 691         LitVal <<= 8;
 692       }
 693     }
 694
 695     LitVal = LitVal + ResultChar;
 696     ++NumCharsSoFar;
 697   }
 698
 699   // If this is the second character being processed, do special handling.
 700   if (NumCharsSoFar > 1) {
 701     // Warn about discarding the top bits for multi-char wide-character
 702     // constants (L'abcd').
 703     if (IsWide)
 704       PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
 705     else if (NumCharsSoFar != 4)
 706       PP.Diag(Loc, diag::ext_multichar_character_literal);
 707     else
 708       PP.Diag(Loc, diag::ext_four_char_character_literal);
 709     IsMultiChar = true;
 710   } else
 711     IsMultiChar = false;
 712
 713   // Transfer the value from APInt to uint64_t
 714   Value = LitVal.getZExtValue();
 715
 716   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
 717   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
 718   // character constants are not sign extended in the this implementation:
 719   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
 720   if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
 721       PP.getLangOptions().CharIsSigned)
 722     Value = (signed char)Value;
 723 }
 724
 725
 726 ///       string-literal: [C99 6.4.5]
 727 ///          " [s-char-sequence] "
 728 ///         L" [s-char-sequence] "
 729 ///       s-char-sequence:
 730 ///         s-char
 731 ///         s-char-sequence s-char
 732 ///       s-char:
 733 ///         any source character except the double quote ",
 734 ///           backslash \, or newline character
 735 ///         escape-character
 736 ///         universal-character-name
 737 ///       escape-character: [C99 6.4.4.4]
 738 ///         \ escape-code
 739 ///         universal-character-name
 740 ///       escape-code:
 741 ///         character-escape-code
 742 ///         octal-escape-code
 743 ///         hex-escape-code
 744 ///       character-escape-code: one of
 745 ///         n t b r f v a
 746 ///         \ ' " ?
 747 ///       octal-escape-code:
 748 ///         octal-digit
 749 ///         octal-digit octal-digit
 750 ///         octal-digit octal-digit octal-digit
 751 ///       hex-escape-code:
 752 ///         x hex-digit
 753 ///         hex-escape-code hex-digit
 754 ///       universal-character-name:
 755 ///         \u hex-quad
 756 ///         \U hex-quad hex-quad
 757 ///       hex-quad:
 758 ///         hex-digit hex-digit hex-digit hex-digit
 759 ///
 760 StringLiteralParser::
 761 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
 762                     Preprocessor &pp, bool Complain) : PP(pp) {
 763   // Scan all of the string portions, remember the max individual token length,
 764   // computing a bound on the concatenated string length, and see whether any
 765   // piece is a wide-string.  If any of the string portions is a wide-string
 766   // literal, the result is a wide-string literal [C99 6.4.5p4].
 767   MaxTokenLength = StringToks[0].getLength();
 768   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
 769   AnyWide = StringToks[0].is(tok::wide_string_literal);
 770
 771   hadError = false;
 772
 773   // Implement Translation Phase #6: concatenation of string literals
 774   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
 775   for (unsigned i = 1; i != NumStringToks; ++i) {
 776     // The string could be shorter than this if it needs cleaning, but this is a
 777     // reasonable bound, which is all we need.
 778     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
 779
 780     // Remember maximum string piece length.
 781     if (StringToks[i].getLength() > MaxTokenLength)
 782       MaxTokenLength = StringToks[i].getLength();
 783
 784     // Remember if we see any wide strings.
 785     AnyWide |= StringToks[i].is(tok::wide_string_literal);
 786   }
 787
 788   // Include space for the null terminator.
 789   ++SizeBound;
 790
 791   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 792
 793   // Get the width in bytes of wchar_t.  If no wchar_t strings are used, do not
 794   // query the target.  As such, wchar_tByteWidth is only valid if AnyWide=true.
 795   wchar_tByteWidth = ~0U;
 796   if (AnyWide) {
 797     wchar_tByteWidth = PP.getTargetInfo().getWCharWidth();
 798     assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
 799     wchar_tByteWidth /= 8;
 800   }
 801
 802   // The output buffer size needs to be large enough to hold wide characters.
 803   // This is a worst-case assumption which basically corresponds to L"" "long".
 804   if (AnyWide)
 805     SizeBound *= wchar_tByteWidth;
 806
 807   // Size the temporary buffer to hold the result string data.
 808   ResultBuf.resize(SizeBound);
 809
 810   // Likewise, but for each string piece.
 811   llvm::SmallString<512> TokenBuf;
 812   TokenBuf.resize(MaxTokenLength);
 813
 814   // Loop over all the strings, getting their spelling, and expanding them to
 815   // wide strings as appropriate.
 816   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
 817
 818   Pascal = false;
 819
 820   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
 821     const char *ThisTokBuf = &TokenBuf[0];
 822     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
 823     // that ThisTokBuf points to a buffer that is big enough for the whole token
 824     // and 'spelled' tokens can only shrink.
 825     bool StringInvalid = false;
 826     unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf,
 827                                          &StringInvalid);
 828     if (StringInvalid) {
 829       hadError = 1;
 830       continue;
 831     }
 832
 833     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
 834
 835     // TODO: Input character set mapping support.
 836
 837     // Skip L marker for wide strings.
 838     bool ThisIsWide = false;
 839     if (ThisTokBuf[0] == 'L') {
 840       ++ThisTokBuf;
 841       ThisIsWide = true;
 842     }
 843
 844     assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
 845     ++ThisTokBuf;
 846
 847     // Check if this is a pascal string
 848     if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
 849         ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
 850
 851       // If the \p sequence is found in the first token, we have a pascal string
 852       // Otherwise, if we already have a pascal string, ignore the first \p
 853       if (i == 0) {
 854         ++ThisTokBuf;
 855         Pascal = true;
 856       } else if (Pascal)
 857         ThisTokBuf += 2;
 858     }
 859
 860     while (ThisTokBuf != ThisTokEnd) {
 861       // Is this a span of non-escape characters?
 862       if (ThisTokBuf[0] != '\\') {
 863         const char *InStart = ThisTokBuf;
 864         do {
 865           ++ThisTokBuf;
 866         } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 867
 868         // Copy the character span over.
 869         unsigned Len = ThisTokBuf-InStart;
 870         if (!AnyWide) {
 871           memcpy(ResultPtr, InStart, Len);
 872           ResultPtr += Len;
 873         } else {
 874           // Note: our internal rep of wide char tokens is always little-endian.
 875           for (; Len; --Len, ++InStart) {
 876             *ResultPtr++ = InStart[0];
 877             // Add zeros at the end.
 878             for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
 879               *ResultPtr++ = 0;
 880           }
 881         }
 882         continue;
 883       }
 884       // Is this a Universal Character Name escape?
 885       if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
 886         ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
 887                          hadError, StringToks[i].getLocation(), ThisIsWide, PP,
 888                          Complain);
 889         continue;
 890       }
 891       // Otherwise, this is a non-UCN escape character.  Process it.
 892       unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
 893                                               StringToks[i].getLocation(),
 894                                               ThisIsWide, PP, Complain);
 895
 896       // Note: our internal rep of wide char tokens is always little-endian.
 897       *ResultPtr++ = ResultChar & 0xFF;
 898
 899       if (AnyWide) {
 900         for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
 901           *ResultPtr++ = ResultChar >> i*8;
 902       }
 903     }
 904   }
 905
 906   if (Pascal) {
 907     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
 908
 909     // Verify that pascal strings aren't too large.
 910     if (GetStringLength() > 256 && Complain) {
 911       PP.Diag(StringToks[0].getLocation(), diag::err_pascal_string_too_long)
 912         << SourceRange(StringToks[0].getLocation(),
 913                        StringToks[NumStringToks-1].getLocation());
 914       hadError = 1;
 915       return;
 916     }
 917   }
 918 }
 919
 920
 921 /// getOffsetOfStringByte - This function returns the offset of the
 922 /// specified byte of the string data represented by Token.  This handles
 923 /// advancing over escape sequences in the string.
 924 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
 925                                                     unsigned ByteNo,
 926                                                     Preprocessor &PP,
 927                                                     bool Complain) {
 928   // Get the spelling of the token.
 929   llvm::SmallString<16> SpellingBuffer;
 930   SpellingBuffer.resize(Tok.getLength());
 931
 932   bool StringInvalid = false;
 933   const char *SpellingPtr = &SpellingBuffer[0];
 934   unsigned TokLen = PP.getSpelling(Tok, SpellingPtr, &StringInvalid);
 935   if (StringInvalid) {
 936     return 0;
 937   }
 938
 939   assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
 940
 941
 942   const char *SpellingStart = SpellingPtr;
 943   const char *SpellingEnd = SpellingPtr+TokLen;
 944
 945   // Skip over the leading quote.
 946   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
 947   ++SpellingPtr;
 948
 949   // Skip over bytes until we find the offset we're looking for.
 950   while (ByteNo) {
 951     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
 952
 953     // Step over non-escapes simply.
 954     if (*SpellingPtr != '\\') {
 955       ++SpellingPtr;
 956       --ByteNo;
 957       continue;
 958     }
 959
 960     // Otherwise, this is an escape character.  Advance over it.
 961     bool HadError = false;
 962     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
 963                       Tok.getLocation(), false, PP, Complain);
 964     assert(!HadError && "This method isn't valid on erroneous strings");
 965     --ByteNo;
 966   }
 967
 968   return SpellingPtr-SpellingStart;
 969 }