contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp

   1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // This file implements the NumericLiteralParser, CharLiteralParser, and
  11 // StringLiteralParser interfaces.
  12 //
  13 //===----------------------------------------------------------------------===//
  14
  15 #include "clang/Lex/LiteralSupport.h"
  16 #include "clang/Lex/Preprocessor.h"
  17 #include "clang/Lex/LexDiagnostic.h"
  18 #include "clang/Basic/TargetInfo.h"
  19 #include "llvm/ADT/StringExtras.h"
  20 #include "llvm/Support/ErrorHandling.h"
  21 using namespace clang;
  22
  23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
  24 /// not valid.
  25 static int HexDigitValue(char C) {
  26   if (C >= '0' && C <= '9') return C-'0';
  27   if (C >= 'a' && C <= 'f') return C-'a'+10;
  28   if (C >= 'A' && C <= 'F') return C-'A'+10;
  29   return -1;
  30 }
  31
  32 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  33   switch (kind) {
  34   default: llvm_unreachable("Unknown token type!");
  35   case tok::char_constant:
  36   case tok::string_literal:
  37   case tok::utf8_string_literal:
  38     return Target.getCharWidth();
  39   case tok::wide_char_constant:
  40   case tok::wide_string_literal:
  41     return Target.getWCharWidth();
  42   case tok::utf16_char_constant:
  43   case tok::utf16_string_literal:
  44     return Target.getChar16Width();
  45   case tok::utf32_char_constant:
  46   case tok::utf32_string_literal:
  47     return Target.getChar32Width();
  48   }
  49 }
  50
  51 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
  52 /// either a character or a string literal.
  53 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
  54                                   const char *ThisTokEnd, bool &HadError,
  55                                   FullSourceLoc Loc, unsigned CharWidth,
  56                                   DiagnosticsEngine *Diags) {
  57   // Skip the '\' char.
  58   ++ThisTokBuf;
  59
  60   // We know that this character can't be off the end of the buffer, because
  61   // that would have been \", which would not have been the end of string.
  62   unsigned ResultChar = *ThisTokBuf++;
  63   switch (ResultChar) {
  64   // These map to themselves.
  65   case '\\': case '\'': case '"': case '?': break;
  66
  67     // These have fixed mappings.
  68   case 'a':
  69     // TODO: K&R: the meaning of '\\a' is different in traditional C
  70     ResultChar = 7;
  71     break;
  72   case 'b':
  73     ResultChar = 8;
  74     break;
  75   case 'e':
  76     if (Diags)
  77       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
  78     ResultChar = 27;
  79     break;
  80   case 'E':
  81     if (Diags)
  82       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
  83     ResultChar = 27;
  84     break;
  85   case 'f':
  86     ResultChar = 12;
  87     break;
  88   case 'n':
  89     ResultChar = 10;
  90     break;
  91   case 'r':
  92     ResultChar = 13;
  93     break;
  94   case 't':
  95     ResultChar = 9;
  96     break;
  97   case 'v':
  98     ResultChar = 11;
  99     break;
 100   case 'x': { // Hex escape.
 101     ResultChar = 0;
 102     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
 103       if (Diags)
 104         Diags->Report(Loc, diag::err_hex_escape_no_digits);
 105       HadError = 1;
 106       break;
 107     }
 108
 109     // Hex escapes are a maximal series of hex digits.
 110     bool Overflow = false;
 111     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
 112       int CharVal = HexDigitValue(ThisTokBuf[0]);
 113       if (CharVal == -1) break;
 114       // About to shift out a digit?
 115       Overflow |= (ResultChar & 0xF0000000) ? true : false;
 116       ResultChar <<= 4;
 117       ResultChar |= CharVal;
 118     }
 119
 120     // See if any bits will be truncated when evaluated as a character.
 121     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 122       Overflow = true;
 123       ResultChar &= ~0U >> (32-CharWidth);
 124     }
 125
 126     // Check for overflow.
 127     if (Overflow && Diags)   // Too many digits to fit in
 128       Diags->Report(Loc, diag::warn_hex_escape_too_large);
 129     break;
 130   }
 131   case '0': case '1': case '2': case '3':
 132   case '4': case '5': case '6': case '7': {
 133     // Octal escapes.
 134     --ThisTokBuf;
 135     ResultChar = 0;
 136
 137     // Octal escapes are a series of octal digits with maximum length 3.
 138     // "\0123" is a two digit sequence equal to "\012" "3".
 139     unsigned NumDigits = 0;
 140     do {
 141       ResultChar <<= 3;
 142       ResultChar |= *ThisTokBuf++ - '0';
 143       ++NumDigits;
 144     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
 145              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
 146
 147     // Check for overflow.  Reject '\777', but not L'\777'.
 148     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 149       if (Diags)
 150         Diags->Report(Loc, diag::warn_octal_escape_too_large);
 151       ResultChar &= ~0U >> (32-CharWidth);
 152     }
 153     break;
 154   }
 155
 156     // Otherwise, these are not valid escapes.
 157   case '(': case '{': case '[': case '%':
 158     // GCC accepts these as extensions.  We warn about them as such though.
 159     if (Diags)
 160       Diags->Report(Loc, diag::ext_nonstandard_escape)
 161         << std::string()+(char)ResultChar;
 162     break;
 163   default:
 164     if (Diags == 0)
 165       break;
 166
 167     if (isgraph(ResultChar))
 168       Diags->Report(Loc, diag::ext_unknown_escape)
 169         << std::string()+(char)ResultChar;
 170     else
 171       Diags->Report(Loc, diag::ext_unknown_escape)
 172         << "x"+llvm::utohexstr(ResultChar);
 173     break;
 174   }
 175
 176   return ResultChar;
 177 }
 178
 179 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
 180 /// return the UTF32.
 181 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 182                              uint32_t &UcnVal, unsigned short &UcnLen,
 183                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
 184                              const LangOptions &Features) {
 185   if (!Features.CPlusPlus && !Features.C99 && Diags)
 186     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
 187
 188   // Save the beginning of the string (for error diagnostics).
 189   const char *ThisTokBegin = ThisTokBuf;
 190
 191   // Skip the '\u' char's.
 192   ThisTokBuf += 2;
 193
 194   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
 195     if (Diags)
 196       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
 197     return false;
 198   }
 199   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
 200   unsigned short UcnLenSave = UcnLen;
 201   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
 202     int CharVal = HexDigitValue(ThisTokBuf[0]);
 203     if (CharVal == -1) break;
 204     UcnVal <<= 4;
 205     UcnVal |= CharVal;
 206   }
 207   // If we didn't consume the proper number of digits, there is a problem.
 208   if (UcnLenSave) {
 209     if (Diags) {
 210       SourceLocation L =
 211         Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
 212                                        Loc.getManager(), Features);
 213       Diags->Report(FullSourceLoc(L, Loc.getManager()),
 214                     diag::err_ucn_escape_incomplete);
 215     }
 216     return false;
 217   }
 218   // Check UCN constraints (C99 6.4.3p2).
 219   if ((UcnVal < 0xa0 &&
 220       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
 221       || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
 222       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
 223     if (Diags)
 224       Diags->Report(Loc, diag::err_ucn_escape_invalid);
 225     return false;
 226   }
 227   return true;
 228 }
 229
 230 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
 231 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
 232 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
 233 /// we will likely rework our support for UCN's.
 234 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
 235                             char *&ResultBuf, bool &HadError,
 236                             FullSourceLoc Loc, unsigned CharByteWidth,
 237                             DiagnosticsEngine *Diags,
 238                             const LangOptions &Features) {
 239   typedef uint32_t UTF32;
 240   UTF32 UcnVal = 0;
 241   unsigned short UcnLen = 0;
 242   if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
 243                         Features)) {
 244     HadError = 1;
 245     return;
 246   }
 247
 248   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
 249          "only character widths of 1, 2, or 4 bytes supported");
 250
 251   (void)UcnLen;
 252   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
 253
 254   if (CharByteWidth == 4) {
 255     // Note: our internal rep of wide char tokens is always little-endian.
 256     *ResultBuf++ = (UcnVal & 0x000000FF);
 257     *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 258     *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
 259     *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
 260     return;
 261   }
 262
 263   if (CharByteWidth == 2) {
 264     // Convert to UTF16.
 265     if (UcnVal < (UTF32)0xFFFF) {
 266       *ResultBuf++ = (UcnVal & 0x000000FF);
 267       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
 268       return;
 269     }
 270     if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
 271
 272     typedef uint16_t UTF16;
 273     UcnVal -= 0x10000;
 274     UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
 275     UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
 276     *ResultBuf++ = (surrogate1 & 0x000000FF);
 277     *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
 278     *ResultBuf++ = (surrogate2 & 0x000000FF);
 279     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
 280     return;
 281   }
 282
 283   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
 284
 285   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
 286   // The conversion below was inspired by:
 287   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
 288   // First, we determine how many bytes the result will require.
 289   typedef uint8_t UTF8;
 290
 291   unsigned short bytesToWrite = 0;
 292   if (UcnVal < (UTF32)0x80)
 293     bytesToWrite = 1;
 294   else if (UcnVal < (UTF32)0x800)
 295     bytesToWrite = 2;
 296   else if (UcnVal < (UTF32)0x10000)
 297     bytesToWrite = 3;
 298   else
 299     bytesToWrite = 4;
 300
 301   const unsigned byteMask = 0xBF;
 302   const unsigned byteMark = 0x80;
 303
 304   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
 305   // into the first byte, depending on how many bytes follow.
 306   static const UTF8 firstByteMark[5] = {
 307     0x00, 0x00, 0xC0, 0xE0, 0xF0
 308   };
 309   // Finally, we write the bytes into ResultBuf.
 310   ResultBuf += bytesToWrite;
 311   switch (bytesToWrite) { // note: everything falls through.
 312     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 313     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 314     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
 315     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
 316   }
 317   // Update the buffer.
 318   ResultBuf += bytesToWrite;
 319 }
 320
 321
 322 ///       integer-constant: [C99 6.4.4.1]
 323 ///         decimal-constant integer-suffix
 324 ///         octal-constant integer-suffix
 325 ///         hexadecimal-constant integer-suffix
 326 ///       decimal-constant:
 327 ///         nonzero-digit
 328 ///         decimal-constant digit
 329 ///       octal-constant:
 330 ///         0
 331 ///         octal-constant octal-digit
 332 ///       hexadecimal-constant:
 333 ///         hexadecimal-prefix hexadecimal-digit
 334 ///         hexadecimal-constant hexadecimal-digit
 335 ///       hexadecimal-prefix: one of
 336 ///         0x 0X
 337 ///       integer-suffix:
 338 ///         unsigned-suffix [long-suffix]
 339 ///         unsigned-suffix [long-long-suffix]
 340 ///         long-suffix [unsigned-suffix]
 341 ///         long-long-suffix [unsigned-sufix]
 342 ///       nonzero-digit:
 343 ///         1 2 3 4 5 6 7 8 9
 344 ///       octal-digit:
 345 ///         0 1 2 3 4 5 6 7
 346 ///       hexadecimal-digit:
 347 ///         0 1 2 3 4 5 6 7 8 9
 348 ///         a b c d e f
 349 ///         A B C D E F
 350 ///       unsigned-suffix: one of
 351 ///         u U
 352 ///       long-suffix: one of
 353 ///         l L
 354 ///       long-long-suffix: one of
 355 ///         ll LL
 356 ///
 357 ///       floating-constant: [C99 6.4.4.2]
 358 ///         TODO: add rules...
 359 ///
 360 NumericLiteralParser::
 361 NumericLiteralParser(const char *begin, const char *end,
 362                      SourceLocation TokLoc, Preprocessor &pp)
 363   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
 364
 365   // This routine assumes that the range begin/end matches the regex for integer
 366   // and FP constants (specifically, the 'pp-number' regex), and assumes that
 367   // the byte at "*end" is both valid and not part of the regex.  Because of
 368   // this, it doesn't have to check for 'overscan' in various places.
 369   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
 370          "Lexer didn't maximally munch?");
 371
 372   s = DigitsBegin = begin;
 373   saw_exponent = false;
 374   saw_period = false;
 375   isLong = false;
 376   isUnsigned = false;
 377   isLongLong = false;
 378   isFloat = false;
 379   isImaginary = false;
 380   isMicrosoftInteger = false;
 381   hadError = false;
 382
 383   if (*s == '0') { // parse radix
 384     ParseNumberStartingWithZero(TokLoc);
 385     if (hadError)
 386       return;
 387   } else { // the first digit is non-zero
 388     radix = 10;
 389     s = SkipDigits(s);
 390     if (s == ThisTokEnd) {
 391       // Done.
 392     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
 393       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 394               diag::err_invalid_decimal_digit) << StringRef(s, 1);
 395       hadError = true;
 396       return;
 397     } else if (*s == '.') {
 398       s++;
 399       saw_period = true;
 400       s = SkipDigits(s);
 401     }
 402     if ((*s == 'e' || *s == 'E')) { // exponent
 403       const char *Exponent = s;
 404       s++;
 405       saw_exponent = true;
 406       if (*s == '+' || *s == '-')  s++; // sign
 407       const char *first_non_digit = SkipDigits(s);
 408       if (first_non_digit != s) {
 409         s = first_non_digit;
 410       } else {
 411         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
 412                 diag::err_exponent_has_no_digits);
 413         hadError = true;
 414         return;
 415       }
 416     }
 417   }
 418
 419   SuffixBegin = s;
 420
 421   // Parse the suffix.  At this point we can classify whether we have an FP or
 422   // integer constant.
 423   bool isFPConstant = isFloatingLiteral();
 424
 425   // Loop over all of the characters of the suffix.  If we see something bad,
 426   // we break out of the loop.
 427   for (; s != ThisTokEnd; ++s) {
 428     switch (*s) {
 429     case 'f':      // FP Suffix for "float"
 430     case 'F':
 431       if (!isFPConstant) break;  // Error for integer constant.
 432       if (isFloat || isLong) break; // FF, LF invalid.
 433       isFloat = true;
 434       continue;  // Success.
 435     case 'u':
 436     case 'U':
 437       if (isFPConstant) break;  // Error for floating constant.
 438       if (isUnsigned) break;    // Cannot be repeated.
 439       isUnsigned = true;
 440       continue;  // Success.
 441     case 'l':
 442     case 'L':
 443       if (isLong || isLongLong) break;  // Cannot be repeated.
 444       if (isFloat) break;               // LF invalid.
 445
 446       // Check for long long.  The L's need to be adjacent and the same case.
 447       if (s+1 != ThisTokEnd && s[1] == s[0]) {
 448         if (isFPConstant) break;        // long long invalid for floats.
 449         isLongLong = true;
 450         ++s;  // Eat both of them.
 451       } else {
 452         isLong = true;
 453       }
 454       continue;  // Success.
 455     case 'i':
 456     case 'I':
 457       if (PP.getLangOptions().MicrosoftExt) {
 458         if (isFPConstant || isLong || isLongLong) break;
 459
 460         // Allow i8, i16, i32, i64, and i128.
 461         if (s + 1 != ThisTokEnd) {
 462           switch (s[1]) {
 463             case '8':
 464               s += 2; // i8 suffix
 465               isMicrosoftInteger = true;
 466               break;
 467             case '1':
 468               if (s + 2 == ThisTokEnd) break;
 469               if (s[2] == '6') {
 470                 s += 3; // i16 suffix
 471                 isMicrosoftInteger = true;
 472               }
 473               else if (s[2] == '2') {
 474                 if (s + 3 == ThisTokEnd) break;
 475                 if (s[3] == '8') {
 476                   s += 4; // i128 suffix
 477                   isMicrosoftInteger = true;
 478                 }
 479               }
 480               break;
 481             case '3':
 482               if (s + 2 == ThisTokEnd) break;
 483               if (s[2] == '2') {
 484                 s += 3; // i32 suffix
 485                 isLong = true;
 486                 isMicrosoftInteger = true;
 487               }
 488               break;
 489             case '6':
 490               if (s + 2 == ThisTokEnd) break;
 491               if (s[2] == '4') {
 492                 s += 3; // i64 suffix
 493                 isLongLong = true;
 494                 isMicrosoftInteger = true;
 495               }
 496               break;
 497             default:
 498               break;
 499           }
 500           break;
 501         }
 502       }
 503       // fall through.
 504     case 'j':
 505     case 'J':
 506       if (isImaginary) break;   // Cannot be repeated.
 507       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 508               diag::ext_imaginary_constant);
 509       isImaginary = true;
 510       continue;  // Success.
 511     }
 512     // If we reached here, there was an error.
 513     break;
 514   }
 515
 516   // Report an error if there are any.
 517   if (s != ThisTokEnd) {
 518     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
 519             isFPConstant ? diag::err_invalid_suffix_float_constant :
 520                            diag::err_invalid_suffix_integer_constant)
 521       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
 522     hadError = true;
 523     return;
 524   }
 525 }
 526
 527 /// ParseNumberStartingWithZero - This method is called when the first character
 528 /// of the number is found to be a zero.  This means it is either an octal
 529 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
 530 /// a floating point number (01239.123e4).  Eat the prefix, determining the
 531 /// radix etc.
 532 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
 533   assert(s[0] == '0' && "Invalid method call");
 534   s++;
 535
 536   // Handle a hex number like 0x1234.
 537   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
 538     s++;
 539     radix = 16;
 540     DigitsBegin = s;
 541     s = SkipHexDigits(s);
 542     if (s == ThisTokEnd) {
 543       // Done.
 544     } else if (*s == '.') {
 545       s++;
 546       saw_period = true;
 547       s = SkipHexDigits(s);
 548     }
 549     // A binary exponent can appear with or with a '.'. If dotted, the
 550     // binary exponent is required.
 551     if (*s == 'p' || *s == 'P') {
 552       const char *Exponent = s;
 553       s++;
 554       saw_exponent = true;
 555       if (*s == '+' || *s == '-')  s++; // sign
 556       const char *first_non_digit = SkipDigits(s);
 557       if (first_non_digit == s) {
 558         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 559                 diag::err_exponent_has_no_digits);
 560         hadError = true;
 561         return;
 562       }
 563       s = first_non_digit;
 564
 565       if (!PP.getLangOptions().HexFloats)
 566         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
 567     } else if (saw_period) {
 568       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 569               diag::err_hexconstant_requires_exponent);
 570       hadError = true;
 571     }
 572     return;
 573   }
 574
 575   // Handle simple binary numbers 0b01010
 576   if (*s == 'b' || *s == 'B') {
 577     // 0b101010 is a GCC extension.
 578     PP.Diag(TokLoc, diag::ext_binary_literal);
 579     ++s;
 580     radix = 2;
 581     DigitsBegin = s;
 582     s = SkipBinaryDigits(s);
 583     if (s == ThisTokEnd) {
 584       // Done.
 585     } else if (isxdigit(*s)) {
 586       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 587               diag::err_invalid_binary_digit) << StringRef(s, 1);
 588       hadError = true;
 589     }
 590     // Other suffixes will be diagnosed by the caller.
 591     return;
 592   }
 593
 594   // For now, the radix is set to 8. If we discover that we have a
 595   // floating point constant, the radix will change to 10. Octal floating
 596   // point constants are not permitted (only decimal and hexadecimal).
 597   radix = 8;
 598   DigitsBegin = s;
 599   s = SkipOctalDigits(s);
 600   if (s == ThisTokEnd)
 601     return; // Done, simple octal number like 01234
 602
 603   // If we have some other non-octal digit that *is* a decimal digit, see if
 604   // this is part of a floating point number like 094.123 or 09e1.
 605   if (isdigit(*s)) {
 606     const char *EndDecimal = SkipDigits(s);
 607     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
 608       s = EndDecimal;
 609       radix = 10;
 610     }
 611   }
 612
 613   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
 614   // the code is using an incorrect base.
 615   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
 616     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
 617             diag::err_invalid_octal_digit) << StringRef(s, 1);
 618     hadError = true;
 619     return;
 620   }
 621
 622   if (*s == '.') {
 623     s++;
 624     radix = 10;
 625     saw_period = true;
 626     s = SkipDigits(s); // Skip suffix.
 627   }
 628   if (*s == 'e' || *s == 'E') { // exponent
 629     const char *Exponent = s;
 630     s++;
 631     radix = 10;
 632     saw_exponent = true;
 633     if (*s == '+' || *s == '-')  s++; // sign
 634     const char *first_non_digit = SkipDigits(s);
 635     if (first_non_digit != s) {
 636       s = first_non_digit;
 637     } else {
 638       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
 639               diag::err_exponent_has_no_digits);
 640       hadError = true;
 641       return;
 642     }
 643   }
 644 }
 645
 646
 647 /// GetIntegerValue - Convert this numeric literal value to an APInt that
 648 /// matches Val's input width.  If there is an overflow, set Val to the low bits
 649 /// of the result and return true.  Otherwise, return false.
 650 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
 651   // Fast path: Compute a conservative bound on the maximum number of
 652   // bits per digit in this radix. If we can't possibly overflow a
 653   // uint64 based on that bound then do the simple conversion to
 654   // integer. This avoids the expensive overflow checking below, and
 655   // handles the common cases that matter (small decimal integers and
 656   // hex/octal values which don't overflow).
 657   unsigned MaxBitsPerDigit = 1;
 658   while ((1U << MaxBitsPerDigit) < radix)
 659     MaxBitsPerDigit += 1;
 660   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
 661     uint64_t N = 0;
 662     for (s = DigitsBegin; s != SuffixBegin; ++s)
 663       N = N*radix + HexDigitValue(*s);
 664
 665     // This will truncate the value to Val's input width. Simply check
 666     // for overflow by comparing.
 667     Val = N;
 668     return Val.getZExtValue() != N;
 669   }
 670
 671   Val = 0;
 672   s = DigitsBegin;
 673
 674   llvm::APInt RadixVal(Val.getBitWidth(), radix);
 675   llvm::APInt CharVal(Val.getBitWidth(), 0);
 676   llvm::APInt OldVal = Val;
 677
 678   bool OverflowOccurred = false;
 679   while (s < SuffixBegin) {
 680     unsigned C = HexDigitValue(*s++);
 681
 682     // If this letter is out of bound for this radix, reject it.
 683     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
 684
 685     CharVal = C;
 686
 687     // Add the digit to the value in the appropriate radix.  If adding in digits
 688     // made the value smaller, then this overflowed.
 689     OldVal = Val;
 690
 691     // Multiply by radix, did overflow occur on the multiply?
 692     Val *= RadixVal;
 693     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
 694
 695     // Add value, did overflow occur on the value?
 696     //   (a + b) ult b  <=> overflow
 697     Val += CharVal;
 698     OverflowOccurred |= Val.ult(CharVal);
 699   }
 700   return OverflowOccurred;
 701 }
 702
 703 llvm::APFloat::opStatus
 704 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 705   using llvm::APFloat;
 706
 707   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
 708   return Result.convertFromString(StringRef(ThisTokBegin, n),
 709                                   APFloat::rmNearestTiesToEven);
 710 }
 711
 712
 713 ///       character-literal: [C++0x lex.ccon]
 714 ///         ' c-char-sequence '
 715 ///         u' c-char-sequence '
 716 ///         U' c-char-sequence '
 717 ///         L' c-char-sequence '
 718 ///       c-char-sequence:
 719 ///         c-char
 720 ///         c-char-sequence c-char
 721 ///       c-char:
 722 ///         any member of the source character set except the single-quote ',
 723 ///           backslash \, or new-line character
 724 ///         escape-sequence
 725 ///         universal-character-name
 726 ///       escape-sequence: [C++0x lex.ccon]
 727 ///         simple-escape-sequence
 728 ///         octal-escape-sequence
 729 ///         hexadecimal-escape-sequence
 730 ///       simple-escape-sequence:
 731 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
 732 ///       octal-escape-sequence:
 733 ///         \ octal-digit
 734 ///         \ octal-digit octal-digit
 735 ///         \ octal-digit octal-digit octal-digit
 736 ///       hexadecimal-escape-sequence:
 737 ///         \x hexadecimal-digit
 738 ///         hexadecimal-escape-sequence hexadecimal-digit
 739 ///       universal-character-name:
 740 ///         \u hex-quad
 741 ///         \U hex-quad hex-quad
 742 ///       hex-quad:
 743 ///         hex-digit hex-digit hex-digit hex-digit
 744 ///
 745 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
 746                                      SourceLocation Loc, Preprocessor &PP,
 747                                      tok::TokenKind kind) {
 748   // At this point we know that the character matches the regex "L?'.*'".
 749   HadError = false;
 750
 751   Kind = kind;
 752
 753   // Determine if this is a wide or UTF character.
 754   if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
 755       Kind == tok::utf32_char_constant) {
 756     ++begin;
 757   }
 758
 759   // Skip over the entry quote.
 760   assert(begin[0] == '\'' && "Invalid token lexed");
 761   ++begin;
 762
 763   // FIXME: The "Value" is an uint64_t so we can handle char literals of
 764   // up to 64-bits.
 765   // FIXME: This extensively assumes that 'char' is 8-bits.
 766   assert(PP.getTargetInfo().getCharWidth() == 8 &&
 767          "Assumes char is 8 bits");
 768   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
 769          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
 770          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
 771   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
 772          "Assumes sizeof(wchar) on target is <= 64");
 773
 774   // This is what we will use for overflow detection
 775   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
 776
 777   unsigned NumCharsSoFar = 0;
 778   bool Warned = false;
 779   while (begin[0] != '\'') {
 780     uint64_t ResultChar;
 781
 782       // Is this a Universal Character Name escape?
 783     if (begin[0] != '\\')     // If this is a normal character, consume it.
 784       ResultChar = (unsigned char)*begin++;
 785     else {                    // Otherwise, this is an escape character.
 786       unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
 787       // Check for UCN.
 788       if (begin[1] == 'u' || begin[1] == 'U') {
 789         uint32_t utf32 = 0;
 790         unsigned short UcnLen = 0;
 791         if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
 792                               FullSourceLoc(Loc, PP.getSourceManager()),
 793                               &PP.getDiagnostics(), PP.getLangOptions())) {
 794           HadError = 1;
 795         }
 796         ResultChar = utf32;
 797         if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
 798           PP.Diag(Loc, diag::warn_ucn_escape_too_large);
 799           ResultChar &= ~0U >> (32-CharWidth);
 800         }
 801       } else {
 802         // Otherwise, this is a non-UCN escape character.  Process it.
 803         ResultChar = ProcessCharEscape(begin, end, HadError,
 804                                        FullSourceLoc(Loc,PP.getSourceManager()),
 805                                        CharWidth, &PP.getDiagnostics());
 806       }
 807     }
 808
 809     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
 810     // implementation defined (C99 6.4.4.4p10).
 811     if (NumCharsSoFar) {
 812       if (!isAscii()) {
 813         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
 814         LitVal = 0;
 815       } else {
 816         // Narrow character literals act as though their value is concatenated
 817         // in this implementation, but warn on overflow.
 818         if (LitVal.countLeadingZeros() < 8 && !Warned) {
 819           PP.Diag(Loc, diag::warn_char_constant_too_large);
 820           Warned = true;
 821         }
 822         LitVal <<= 8;
 823       }
 824     }
 825
 826     LitVal = LitVal + ResultChar;
 827     ++NumCharsSoFar;
 828   }
 829
 830   // If this is the second character being processed, do special handling.
 831   if (NumCharsSoFar > 1) {
 832     // Warn about discarding the top bits for multi-char wide-character
 833     // constants (L'abcd').
 834     if (!isAscii())
 835       PP.Diag(Loc, diag::warn_extraneous_char_constant);
 836     else if (NumCharsSoFar != 4)
 837       PP.Diag(Loc, diag::ext_multichar_character_literal);
 838     else
 839       PP.Diag(Loc, diag::ext_four_char_character_literal);
 840     IsMultiChar = true;
 841   } else
 842     IsMultiChar = false;
 843
 844   // Transfer the value from APInt to uint64_t
 845   Value = LitVal.getZExtValue();
 846
 847   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
 848   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
 849   // character constants are not sign extended in the this implementation:
 850   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
 851   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
 852       PP.getLangOptions().CharIsSigned)
 853     Value = (signed char)Value;
 854 }
 855
 856
 857 ///       string-literal: [C++0x lex.string]
 858 ///         encoding-prefix " [s-char-sequence] "
 859 ///         encoding-prefix R raw-string
 860 ///       encoding-prefix:
 861 ///         u8
 862 ///         u
 863 ///         U
 864 ///         L
 865 ///       s-char-sequence:
 866 ///         s-char
 867 ///         s-char-sequence s-char
 868 ///       s-char:
 869 ///         any member of the source character set except the double-quote ",
 870 ///           backslash \, or new-line character
 871 ///         escape-sequence
 872 ///         universal-character-name
 873 ///       raw-string:
 874 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
 875 ///       r-char-sequence:
 876 ///         r-char
 877 ///         r-char-sequence r-char
 878 ///       r-char:
 879 ///         any member of the source character set, except a right parenthesis )
 880 ///           followed by the initial d-char-sequence (which may be empty)
 881 ///           followed by a double quote ".
 882 ///       d-char-sequence:
 883 ///         d-char
 884 ///         d-char-sequence d-char
 885 ///       d-char:
 886 ///         any member of the basic source character set except:
 887 ///           space, the left parenthesis (, the right parenthesis ),
 888 ///           the backslash \, and the control characters representing horizontal
 889 ///           tab, vertical tab, form feed, and newline.
 890 ///       escape-sequence: [C++0x lex.ccon]
 891 ///         simple-escape-sequence
 892 ///         octal-escape-sequence
 893 ///         hexadecimal-escape-sequence
 894 ///       simple-escape-sequence:
 895 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
 896 ///       octal-escape-sequence:
 897 ///         \ octal-digit
 898 ///         \ octal-digit octal-digit
 899 ///         \ octal-digit octal-digit octal-digit
 900 ///       hexadecimal-escape-sequence:
 901 ///         \x hexadecimal-digit
 902 ///         hexadecimal-escape-sequence hexadecimal-digit
 903 ///       universal-character-name:
 904 ///         \u hex-quad
 905 ///         \U hex-quad hex-quad
 906 ///       hex-quad:
 907 ///         hex-digit hex-digit hex-digit hex-digit
 908 ///
 909 StringLiteralParser::
 910 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
 911                     Preprocessor &PP, bool Complain)
 912   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
 913     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
 914     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
 915     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
 916   init(StringToks, NumStringToks);
 917 }
 918
 919 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 920   // The literal token may have come from an invalid source location (e.g. due
 921   // to a PCH error), in which case the token length will be 0.
 922   if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
 923     hadError = true;
 924     return;
 925   }
 926
 927   // Scan all of the string portions, remember the max individual token length,
 928   // computing a bound on the concatenated string length, and see whether any
 929   // piece is a wide-string.  If any of the string portions is a wide-string
 930   // literal, the result is a wide-string literal [C99 6.4.5p4].
 931   assert(NumStringToks && "expected at least one token");
 932   MaxTokenLength = StringToks[0].getLength();
 933   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
 934   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
 935   Kind = StringToks[0].getKind();
 936
 937   hadError = false;
 938
 939   // Implement Translation Phase #6: concatenation of string literals
 940   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
 941   for (unsigned i = 1; i != NumStringToks; ++i) {
 942     if (StringToks[i].getLength() < 2) {
 943       hadError = true;
 944       return;
 945     }
 946
 947     // The string could be shorter than this if it needs cleaning, but this is a
 948     // reasonable bound, which is all we need.
 949     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
 950     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
 951
 952     // Remember maximum string piece length.
 953     if (StringToks[i].getLength() > MaxTokenLength)
 954       MaxTokenLength = StringToks[i].getLength();
 955
 956     // Remember if we see any wide or utf-8/16/32 strings.
 957     // Also check for illegal concatenations.
 958     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
 959       if (isAscii()) {
 960         Kind = StringToks[i].getKind();
 961       } else {
 962         if (Diags)
 963           Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
 964                         diag::err_unsupported_string_concat);
 965         hadError = true;
 966       }
 967     }
 968   }
 969
 970   // Include space for the null terminator.
 971   ++SizeBound;
 972
 973   // TODO: K&R warning: "traditional C rejects string constant concatenation"
 974
 975   // Get the width in bytes of char/wchar_t/char16_t/char32_t
 976   CharByteWidth = getCharWidth(Kind, Target);
 977   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
 978   CharByteWidth /= 8;
 979
 980   // The output buffer size needs to be large enough to hold wide characters.
 981   // This is a worst-case assumption which basically corresponds to L"" "long".
 982   SizeBound *= CharByteWidth;
 983
 984   // Size the temporary buffer to hold the result string data.
 985   ResultBuf.resize(SizeBound);
 986
 987   // Likewise, but for each string piece.
 988   llvm::SmallString<512> TokenBuf;
 989   TokenBuf.resize(MaxTokenLength);
 990
 991   // Loop over all the strings, getting their spelling, and expanding them to
 992   // wide strings as appropriate.
 993   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
 994
 995   Pascal = false;
 996
 997   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
 998     const char *ThisTokBuf = &TokenBuf[0];
 999     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1000     // that ThisTokBuf points to a buffer that is big enough for the whole token
1001     // and 'spelled' tokens can only shrink.
1002     bool StringInvalid = false;
1003     unsigned ThisTokLen =
1004       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1005                          &StringInvalid);
1006     if (StringInvalid) {
1007       hadError = true;
1008       continue;
1009     }
1010
1011     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
1012     // TODO: Input character set mapping support.
1013
1014     // Skip marker for wide or unicode strings.
1015     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1016       ++ThisTokBuf;
1017       // Skip 8 of u8 marker for utf8 strings.
1018       if (ThisTokBuf[0] == '8')
1019         ++ThisTokBuf;
1020     }
1021
1022     // Check for raw string
1023     if (ThisTokBuf[0] == 'R') {
1024       ThisTokBuf += 2; // skip R"
1025
1026       const char *Prefix = ThisTokBuf;
1027       while (ThisTokBuf[0] != '(')
1028         ++ThisTokBuf;
1029       ++ThisTokBuf; // skip '('
1030
1031       // remove same number of characters from the end
1032       if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
1033         ThisTokEnd -= (ThisTokBuf - Prefix);
1034
1035       // Copy the string over
1036       CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
1037     } else {
1038       assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
1039       ++ThisTokBuf; // skip "
1040
1041       // Check if this is a pascal string
1042       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1043           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1044
1045         // If the \p sequence is found in the first token, we have a pascal string
1046         // Otherwise, if we already have a pascal string, ignore the first \p
1047         if (i == 0) {
1048           ++ThisTokBuf;
1049           Pascal = true;
1050         } else if (Pascal)
1051           ThisTokBuf += 2;
1052       }
1053
1054       while (ThisTokBuf != ThisTokEnd) {
1055         // Is this a span of non-escape characters?
1056         if (ThisTokBuf[0] != '\\') {
1057           const char *InStart = ThisTokBuf;
1058           do {
1059             ++ThisTokBuf;
1060           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1061
1062           // Copy the character span over.
1063           CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
1064           continue;
1065         }
1066         // Is this a Universal Character Name escape?
1067         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1068           EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
1069                           hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
1070                           CharByteWidth, Diags, Features);
1071           continue;
1072         }
1073         // Otherwise, this is a non-UCN escape character.  Process it.
1074         unsigned ResultChar =
1075           ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
1076                             FullSourceLoc(StringToks[i].getLocation(), SM),
1077                             CharByteWidth*8, Diags);
1078
1079         // Note: our internal rep of wide char tokens is always little-endian.
1080         *ResultPtr++ = ResultChar & 0xFF;
1081
1082         for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
1083           *ResultPtr++ = ResultChar >> i*8;
1084       }
1085     }
1086   }
1087
1088   if (Pascal) {
1089     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
1090     ResultBuf[0] /= CharByteWidth;
1091
1092     // Verify that pascal strings aren't too large.
1093     if (GetStringLength() > 256) {
1094       if (Diags)
1095         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1096                       diag::err_pascal_string_too_long)
1097           << SourceRange(StringToks[0].getLocation(),
1098                          StringToks[NumStringToks-1].getLocation());
1099       hadError = true;
1100       return;
1101     }
1102   } else if (Diags) {
1103     // Complain if this string literal has too many characters.
1104     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1105
1106     if (GetNumStringChars() > MaxChars)
1107       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1108                     diag::ext_string_too_long)
1109         << GetNumStringChars() << MaxChars
1110         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1111         << SourceRange(StringToks[0].getLocation(),
1112                        StringToks[NumStringToks-1].getLocation());
1113   }
1114 }
1115
1116
1117 /// copyStringFragment - This function copies from Start to End into ResultPtr.
1118 /// Performs widening for multi-byte characters.
1119 void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
1120   // Copy the character span over.
1121   if (CharByteWidth == 1) {
1122     memcpy(ResultPtr, Fragment.data(), Fragment.size());
1123     ResultPtr += Fragment.size();
1124   } else {
1125     // Note: our internal rep of wide char tokens is always little-endian.
1126     for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
1127       *ResultPtr++ = *I;
1128       // Add zeros at the end.
1129       for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
1130         *ResultPtr++ = 0;
1131     }
1132   }
1133 }
1134
1135
1136 /// getOffsetOfStringByte - This function returns the offset of the
1137 /// specified byte of the string data represented by Token.  This handles
1138 /// advancing over escape sequences in the string.
1139 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1140                                                     unsigned ByteNo) const {
1141   // Get the spelling of the token.
1142   llvm::SmallString<32> SpellingBuffer;
1143   SpellingBuffer.resize(Tok.getLength());
1144
1145   bool StringInvalid = false;
1146   const char *SpellingPtr = &SpellingBuffer[0];
1147   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1148                                        &StringInvalid);
1149   if (StringInvalid)
1150     return 0;
1151
1152   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1153          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1154
1155
1156   const char *SpellingStart = SpellingPtr;
1157   const char *SpellingEnd = SpellingPtr+TokLen;
1158
1159   // Skip over the leading quote.
1160   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1161   ++SpellingPtr;
1162
1163   // Skip over bytes until we find the offset we're looking for.
1164   while (ByteNo) {
1165     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1166
1167     // Step over non-escapes simply.
1168     if (*SpellingPtr != '\\') {
1169       ++SpellingPtr;
1170       --ByteNo;
1171       continue;
1172     }
1173
1174     // Otherwise, this is an escape character.  Advance over it.
1175     bool HadError = false;
1176     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
1177                       FullSourceLoc(Tok.getLocation(), SM),
1178                       CharByteWidth*8, Diags);
1179     assert(!HadError && "This method isn't valid on erroneous strings");
1180     --ByteNo;
1181   }
1182
1183   return SpellingPtr-SpellingStart;
1184 }