1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file implements the NumericLiteralParser, CharLiteralParser, and
11 // StringLiteralParser interfaces.
13 //===----------------------------------------------------------------------===//
15 #include "clang/Lex/LiteralSupport.h"
16 #include "clang/Lex/Preprocessor.h"
17 #include "clang/Lex/LexDiagnostic.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/ADT/StringExtras.h"
21 using namespace clang;
/// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
/// not a valid hex digit.
///
/// \param C  The character to classify. '0'-'9' map to 0-9, and 'a'-'f' /
///           'A'-'F' map to 10-15.
/// \returns  The digit's numeric value in [0, 15], or -1 for any other
///           character. Callers in this file compare the result against -1
///           to detect the end of a run of hex digits.
static int HexDigitValue(char C) {
  if (C >= '0' && C <= '9') return C-'0';
  if (C >= 'a' && C <= 'f') return C-'a'+10;
  if (C >= 'A' && C <= 'F') return C-'A'+10;
  // Not a hex digit: report the sentinel promised by the contract above.
  return -1;
}
32 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
33 /// either a character or a string literal.
///
/// \param ThisTokBuf  Cursor into the token spelling, passed by reference; it
///                    is advanced as the characters of the escape are consumed.
/// \param ThisTokEnd  End of the readable spelling; the hex and octal digit
///                    loops stop when the cursor reaches it.
/// \param HadError    Error flag owned by the caller (the assignments to it are
///                    not visible in this listing).
/// \param Loc         Location handed to PP.Diag for every diagnostic here.
/// \param IsWide      Selects the target's wchar_t width instead of its char
///                    width for the truncation checks.
/// \param PP          Preprocessor used for diagnostics and target queries.
/// \param Complain    Gates at least the hex-overflow warning below.
///
/// NOTE(review): this listing has gaps -- the switch header, several case
/// labels/bodies and the return statement are not shown. Presumably the
/// function returns ResultChar; confirm against the complete file.
34 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
35 const char *ThisTokEnd, bool &HadError,
36 SourceLocation Loc, bool IsWide,
37 Preprocessor &PP, bool Complain) {
41 // We know that this character can't be off the end of the buffer, because
42 // that would have been \", which would not have been the end of string.
// Read the escape character and step the cursor past it.
43 unsigned ResultChar = *ThisTokBuf++;
45 // These map to themselves.
46 case '\\': case '\'': case '"': case '?': break;
48 // These have fixed mappings.
50 // TODO: K&R: the meaning of '\\a' is different in traditional C
58 PP.Diag(Loc, diag::ext_nonstandard_escape) << "e";
63 PP.Diag(Loc, diag::ext_nonstandard_escape) << "E";
81 case 'x': { // Hex escape.
// A hex escape needs at least one hex digit after the 'x'.
83 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
85 PP.Diag(Loc, diag::err_hex_escape_no_digits);
90 // Hex escapes are a maximal series of hex digits.
91 bool Overflow = false;
92 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
93 int CharVal = HexDigitValue(ThisTokBuf[0]);
// -1 is HexDigitValue's "not a hex digit" result: the run of digits is over.
94 if (CharVal == -1) break;
95 // About to shift out a digit?
// 0xF0000000 selects the top four bits of the 32-bit accumulator.
96 Overflow |= (ResultChar & 0xF0000000) ? true : false;
// NOTE(review): the shift that makes room for this digit is not visible in
// this listing; only the OR of the new digit is shown.
98 ResultChar |= CharVal;
101 // See if any bits will be truncated when evaluated as a character.
102 unsigned CharWidth = IsWide
103 ? PP.getTargetInfo().getWCharWidth()
104 : PP.getTargetInfo().getCharWidth();
// The CharWidth != 32 guard keeps both shifts below narrower than 32 bits.
106 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
// Keep only the low CharWidth bits.
108 ResultChar &= ~0U >> (32-CharWidth);
111 // Check for overflow.
112 if (Overflow && Complain) // Too many digits to fit in
113 PP.Diag(Loc, diag::warn_hex_escape_too_large);
116 case '0': case '1': case '2': case '3':
117 case '4': case '5': case '6': case '7': {
122 // Octal escapes are a series of octal digits with maximum length 3.
123 // "\0123" is a two digit sequence equal to "\012" "3".
124 unsigned NumDigits = 0;
// NOTE(review): the loop head, the shift of ResultChar and the NumDigits
// increment are not visible in this listing.
127 ResultChar |= *ThisTokBuf++ - '0';
129 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
130 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
132 // Check for overflow. Reject '\777', but not L'\777'.
133 unsigned CharWidth = IsWide
134 ? PP.getTargetInfo().getWCharWidth()
135 : PP.getTargetInfo().getCharWidth();
// Same truncation test and low-bits mask as in the hex case.
137 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
139 PP.Diag(Loc, diag::warn_octal_escape_too_large);
140 ResultChar &= ~0U >> (32-CharWidth);
145 // Otherwise, these are not valid escapes.
146 case '(': case '{': case '[': case '%':
147 // GCC accepts these as extensions. We warn about them as such though.
149 PP.Diag(Loc, diag::ext_nonstandard_escape)
150 << std::string()+(char)ResultChar;
// NOTE(review): ThisTokBuf was already advanced when ResultChar was read, so
// ThisTokBuf[0] is the character *after* the escape character, while the
// diagnostics below print ResultChar. Confirm whether isgraph(ResultChar) was
// intended.
156 if (isgraph(ThisTokBuf[0]))
157 PP.Diag(Loc, diag::ext_unknown_escape) << std::string()+(char)ResultChar;
// Otherwise the character is reported as "x" followed by its hex value.
159 PP.Diag(Loc, diag::ext_unknown_escape) << "x"+llvm::utohexstr(ResultChar);
166 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
167 /// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
168 /// When we decide to implement UCN's for character constants and identifiers,
169 /// we will likely rework our support for UCN's.
///
/// \param ThisTokBuf  Cursor into the token spelling, passed by reference and
///                    advanced over the hex digits that are consumed.
/// \param ThisTokEnd  End of the readable spelling.
/// \param ResultBuf   Output cursor, passed by reference; the UTF-8 bytes are
///                    written here and the cursor ends just past them.
/// \param HadError    Error flag owned by the caller (assignments to it are not
///                    visible in this listing).
/// \param Loc         Base location used for diagnostics.
/// \param IsWide      Not read by any line visible here (see the FIXME about
///                    wide strings below).
/// \param PP          Preprocessor used to emit diagnostics.
///
/// NOTE(review): this listing has gaps -- the end of the parameter list, the
/// declaration/accumulation of UcnVal, and the assignments to bytesToWrite are
/// not shown. Confirm details against the complete file.
170 static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
171 char *&ResultBuf, bool &HadError,
172 SourceLocation Loc, bool IsWide, Preprocessor &PP,
175 // FIXME: Add a warning - UCN's are only valid in C++ & C99.
176 // FIXME: Handle wide strings.
178 // Save the beginning of the string (for error diagnostics).
179 const char *ThisTokBegin = ThisTokBuf;
181 // Skip the '\u' char's.
// At least one hex digit must follow the introducer.
184 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
186 PP.Diag(Loc, diag::err_ucn_escape_no_digits);
190 typedef uint32_t UTF32;
// ThisTokBuf[-1] is the introducer letter: 'u' takes 4 hex digits, otherwise 8.
193 unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
// UcnLen counts down the digits still expected; the loop also stops at the
// end of the token or at the first non-hex character.
194 for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
195 int CharVal = HexDigitValue(ThisTokBuf[0]);
196 if (CharVal == -1) break;
200 // If we didn't consume the proper number of digits, there is a problem.
// The diagnostic is placed at the offset reached inside the token.
203 PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
204 diag::err_ucn_escape_incomplete);
208 // Check UCN constraints (C99 6.4.3p2).
// Rejected: values below 0xA0 other than $, @ and `; the range 0xD800-0xDFFF;
// and anything above 0x10FFFF.
209 if ((UcnVal < 0xa0 &&
210 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
211 || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
212 || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
214 PP.Diag(Loc, diag::err_ucn_escape_invalid);
218 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
219 // The conversion below was inspired by:
220 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
221 // First, we determine how many bytes the result will require.
222 typedef uint8_t UTF8;
224 unsigned short bytesToWrite = 0;
// Thresholds are the usual UTF-8 length boundaries (0x80, 0x800, 0x10000);
// the assignments themselves are not visible in this listing.
225 if (UcnVal < (UTF32)0x80)
227 else if (UcnVal < (UTF32)0x800)
229 else if (UcnVal < (UTF32)0x10000)
// (x | byteMark) & byteMask sets bit 7 and clears bit 6, giving a byte of the
// form 10xxxxxx that carries the low six bits of x.
234 const unsigned byteMask = 0xBF;
235 const unsigned byteMark = 0x80;
237 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
238 // into the first byte, depending on how many bytes follow.
// Indexed by bytesToWrite: 0xC0 for 2 bytes, 0xE0 for 3, 0xF0 for 4.
239 static const UTF8 firstByteMark[5] = {
240 0x00, 0x00, 0xC0, 0xE0, 0xF0
242 // Finally, we write the bytes into ResultBuf.
// Jump to one past the last output byte, then fill backwards: each case
// writes one trailing byte and drops six bits; case 1 writes the lead byte.
243 ResultBuf += bytesToWrite;
244 switch (bytesToWrite) { // note: everything falls through.
245 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
246 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
247 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
248 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
250 // Update the buffer.
// The backwards fill left ResultBuf at the first byte; move it past the
// sequence so the caller continues writing after it.
251 ResultBuf += bytesToWrite;
255 /// integer-constant: [C99 6.4.4.1]
256 /// decimal-constant integer-suffix
257 /// octal-constant integer-suffix
258 /// hexadecimal-constant integer-suffix
259 /// decimal-constant:
261 /// decimal-constant digit
264 /// octal-constant octal-digit
265 /// hexadecimal-constant:
266 /// hexadecimal-prefix hexadecimal-digit
267 /// hexadecimal-constant hexadecimal-digit
268 /// hexadecimal-prefix: one of
271 /// unsigned-suffix [long-suffix]
272 /// unsigned-suffix [long-long-suffix]
273 /// long-suffix [unsigned-suffix]
274 /// long-long-suffix [unsigned-suffix]
276 /// 1 2 3 4 5 6 7 8 9
279 /// hexadecimal-digit:
280 /// 0 1 2 3 4 5 6 7 8 9
283 /// unsigned-suffix: one of
285 /// long-suffix: one of
287 /// long-long-suffix: one of
290 /// floating-constant: [C99 6.4.4.2]
291 /// TODO: add rules...
// Constructor: scans the numeric token in [begin, end), leaving the cursor `s`
// at the start of the suffix, then validates the suffix character by
// character. Problems are reported through PP.Diag at the offending offset
// inside the token.
//
// begin/end - spelling of the token; *end must be readable (see below).
// TokLoc    - location of the token start, used as the base for diagnostics.
// pp        - preprocessor used for diagnostics and language options.
//
// NOTE(review): this listing has gaps -- several member initialisations, the
// digit-skipping calls, the switch header and a number of case labels are not
// shown. Confirm details against the complete file.
293 NumericLiteralParser::
294 NumericLiteralParser(const char *begin, const char *end,
295 SourceLocation TokLoc, Preprocessor &pp)
296 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
298 // This routine assumes that the range begin/end matches the regex for integer
299 // and FP constants (specifically, the 'pp-number' regex), and assumes that
300 // the byte at "*end" is both valid and not part of the regex. Because of
301 // this, it doesn't have to check for 'overscan' in various places.
302 assert(!isalnum(*end) && *end != '.' && *end != '_' &&
303 "Lexer didn't maximally munch?");
305 s = DigitsBegin = begin;
306 saw_exponent = false;
313 isMicrosoftInteger = false;
// A leading '0' may introduce a hex, binary, octal or floating literal; that
// case is delegated to ParseNumberStartingWithZero.
316 if (*s == '0') { // parse radix
317 ParseNumberStartingWithZero(TokLoc);
320 } else { // the first digit is non-zero
323 if (s == ThisTokEnd) {
// A hex digit other than e/E cannot appear in a decimal literal; e/E is
// excluded here because it is handled as an exponent marker below.
325 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
326 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
327 diag::err_invalid_decimal_digit) << std::string(s, s+1);
330 } else if (*s == '.') {
335 if ((*s == 'e' || *s == 'E')) { // exponent
// Remember where the exponent marker is for the diagnostic below.
336 const char *Exponent = s;
339 if (*s == '+' || *s == '-') s++; // sign
340 const char *first_non_digit = SkipDigits(s);
// SkipDigits returning a different pointer means it moved past at least one
// digit.
341 if (first_non_digit != s) {
344 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
345 diag::err_exponent_has_no_digits);
354 // Parse the suffix. At this point we can classify whether we have an FP or
// integer constant.
356 bool isFPConstant = isFloatingLiteral();
358 // Loop over all of the characters of the suffix. If we see something bad,
359 // we break out of the loop.
// In the cases below, `continue` accepts the character and moves on to the
// next one; `break` leaves the (not shown) switch and reaches the error path.
360 for (; s != ThisTokEnd; ++s) {
362 case 'f': // FP Suffix for "float"
364 if (!isFPConstant) break; // Error for integer constant.
365 if (isFloat || isLong) break; // FF, LF invalid.
367 continue; // Success.
370 if (isFPConstant) break; // Error for floating constant.
371 if (isUnsigned) break; // Cannot be repeated.
373 continue; // Success.
376 if (isLong || isLongLong) break; // Cannot be repeated.
377 if (isFloat) break; // LF invalid.
379 // Check for long long. The L's need to be adjacent and the same case.
380 if (s+1 != ThisTokEnd && s[1] == s[0]) {
381 if (isFPConstant) break; // long long invalid for floats.
383 ++s; // Eat both of them.
387 continue; // Success.
// Microsoft-mode integer suffixes; rejected on floating constants and after
// a long / long long suffix.
389 if (PP.getLangOptions().Microsoft) {
390 if (isFPConstant || isLong || isLongLong) break;
392 // Allow i8, i16, i32, i64, and i128.
// Each lookahead below first checks that the index it reads is still before
// ThisTokEnd.
393 if (s + 1 != ThisTokEnd) {
397 isMicrosoftInteger = true;
400 if (s + 2 == ThisTokEnd) break;
401 if (s[2] == '6') s += 3; // i16 suffix
402 else if (s[2] == '2') {
403 if (s + 3 == ThisTokEnd) break;
404 if (s[3] == '8') s += 4; // i128 suffix
406 isMicrosoftInteger = true;
409 if (s + 2 == ThisTokEnd) break;
410 if (s[2] == '2') s += 3; // i32 suffix
411 isMicrosoftInteger = true;
414 if (s + 2 == ThisTokEnd) break;
415 if (s[2] == '4') s += 3; // i64 suffix
416 isMicrosoftInteger = true;
428 if (isImaginary) break; // Cannot be repeated.
// The imaginary suffix is accepted, with an extension diagnostic.
429 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
430 diag::ext_imaginary_constant);
432 continue; // Success.
434 // If we reached here, there was an error.
438 // Report an error if there are any.
// The loop stopped before the end of the token: some suffix character was
// rejected. The diagnostic points at that character but quotes the whole
// suffix.
439 if (s != ThisTokEnd) {
440 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
441 isFPConstant ? diag::err_invalid_suffix_float_constant :
442 diag::err_invalid_suffix_integer_constant)
443 << std::string(SuffixBegin, ThisTokEnd);
449 /// ParseNumberStartingWithZero - This method is called when the first character
450 /// of the number is found to be a zero. This means it is either an octal
451 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
452 /// a floating point number (01239.123e4). Eat the prefix, determining the
/// radix.
///
/// \param TokLoc  Location of the token start; used as the base location for
///                every diagnostic emitted here.
///
/// NOTE(review): this listing has gaps -- the step past the leading '0', the
/// radix assignments, the flag updates and several returns are not shown.
/// Confirm details against the complete file.
454 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
455 assert(s[0] == '0' && "Invalid method call");
458 // Handle a hex number like 0x1234.
// The 'x' must be followed by a hex digit or a '.' to be treated as a hex
// prefix.
459 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
463 s = SkipHexDigits(s);
464 if (s == ThisTokEnd) {
466 } else if (*s == '.') {
// Hex digits after the '.'.
469 s = SkipHexDigits(s);
471 // A binary exponent can appear with or without a '.'. If dotted, the
472 // binary exponent is required.
// The 'p' exponent is only parsed here when the C++0x language option is off.
473 if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) {
// Remember where the exponent marker is for the diagnostic below.
474 const char *Exponent = s;
477 if (*s == '+' || *s == '-') s++; // sign
478 const char *first_non_digit = SkipDigits(s);
// SkipDigits did not move: no digits follow the exponent marker.
479 if (first_non_digit == s) {
480 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
481 diag::err_exponent_has_no_digits);
487 // In C++0x, we cannot support hexadecimal floating literals because
488 // they conflict with user-defined literals, so we warn in previous
489 // versions of C++ by default.
490 if (PP.getLangOptions().CPlusPlus)
491 PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus);
492 else if (!PP.getLangOptions().HexFloats)
493 PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
// No exponent was parsed, but a '.' was seen: the exponent is mandatory.
494 } else if (saw_period) {
495 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
496 diag::err_hexconstant_requires_exponent);
502 // Handle simple binary numbers 0b01010
503 if (*s == 'b' || *s == 'B') {
504 // 0b101010 is a GCC extension.
505 PP.Diag(TokLoc, diag::ext_binary_literal);
509 s = SkipBinaryDigits(s);
510 if (s == ThisTokEnd) {
// Any hex digit left after the binary digits is reported as an invalid
// binary digit.
512 } else if (isxdigit(*s)) {
513 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
514 diag::err_invalid_binary_digit) << std::string(s, s+1);
517 // Other suffixes will be diagnosed by the caller.
521 // For now, the radix is set to 8. If we discover that we have a
522 // floating point constant, the radix will change to 10. Octal floating
523 // point constants are not permitted (only decimal and hexadecimal).
526 s = SkipOctalDigits(s);
528 return; // Done, simple octal number like 01234
530 // If we have some other non-octal digit that *is* a decimal digit, see if
531 // this is part of a floating point number like 094.123 or 09e1.
// Look ahead over decimal digits without moving `s` yet.
533 const char *EndDecimal = SkipDigits(s);
534 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
540 // If we have a hex digit other than 'e' (which denotes a FP exponent) then
541 // the code is using an incorrect base.
542 if (isxdigit(*s) && *s != 'e' && *s != 'E') {
543 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
544 diag::err_invalid_octal_digit) << std::string(s, s+1);
553 s = SkipDigits(s); // Skip suffix.
555 if (*s == 'e' || *s == 'E') { // exponent
// Remember where the exponent marker is for the diagnostic below.
556 const char *Exponent = s;
560 if (*s == '+' || *s == '-') s++; // sign
561 const char *first_non_digit = SkipDigits(s);
// SkipDigits returning a different pointer means it moved past at least one
// digit.
562 if (first_non_digit != s) {
565 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
566 diag::err_exponent_has_no_digits);
574 /// GetIntegerValue - Convert this numeric literal value to an APInt that
575 /// matches Val's input width. If there is an overflow, set Val to the low bits
576 /// of the result and return true. Otherwise, return false.
///
/// \param Val  In/out: its bit width selects the width of the conversion, and
///             it receives the converted value.
///
/// NOTE(review): this listing has gaps -- the declaration of N, the store into
/// Val on the fast path, and the multiply/add statements of the slow path are
/// not shown. Confirm details against the complete file.
577 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
578 // Fast path: Compute a conservative bound on the maximum number of
579 // bits per digit in this radix. If we can't possibly overflow a
580 // uint64 based on that bound then do the simple conversion to
581 // integer. This avoids the expensive overflow checking below, and
582 // handles the common cases that matter (small decimal integers and
583 // hex/octal values which don't overflow).
// Smallest k with 2^k >= radix: 1 for radix 2, 3 for radix 8, 4 for radix 10
// and 16.
584 unsigned MaxBitsPerDigit = 1;
585 while ((1U << MaxBitsPerDigit) < radix)
586 MaxBitsPerDigit += 1;
// (number of digit characters) * (bits per digit) bounds the bits needed.
587 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
// Accumulate the digits in the literal's radix.
589 for (s = DigitsBegin; s != SuffixBegin; ++s)
590 N = N*radix + HexDigitValue(*s);
592 // This will truncate the value to Val's input width. Simply check
593 // for overflow by comparing.
// A mismatch means the store into Val lost high bits.
595 return Val.getZExtValue() != N;
// Slow path: arithmetic at Val's own width with explicit overflow detection.
601 llvm::APInt RadixVal(Val.getBitWidth(), radix);
602 llvm::APInt CharVal(Val.getBitWidth(), 0);
603 llvm::APInt OldVal = Val;
605 bool OverflowOccurred = false;
606 while (s < SuffixBegin) {
607 unsigned C = HexDigitValue(*s++);
609 // If this letter is out of bound for this radix, reject it.
// C is unsigned, so a -1 from HexDigitValue becomes a huge value and trips
// this assert as well.
610 assert(C < radix && "NumericLiteralParser ctor should have rejected this");
614 // Add the digit to the value in the appropriate radix. If adding in digits
615 // made the value smaller, then this overflowed.
618 // Multiply by radix, did overflow occur on the multiply?
// Dividing the product back by the radix must give the previous value;
// otherwise the multiply wrapped.
620 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
622 // Add value, did overflow occur on the value?
623 // (a + b) ult b <=> overflow
625 OverflowOccurred |= Val.ult(CharVal);
627 return OverflowOccurred;
// GetFloatValue - Convert the literal's text (everything before the suffix) to
// a floating-point value stored in Result, rounding to nearest with ties to
// even. Returns the opStatus produced by APFloat::convertFromString.
//
// NOTE(review): some lines of this function are not visible in this listing
// (for example whatever brings the unqualified name APFloat into scope).
630 llvm::APFloat::opStatus
631 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
633 using llvm::StringRef;
// Length of the text before the suffix, never more than the whole token.
635 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
636 return Result.convertFromString(StringRef(ThisTokBegin, n),
637 APFloat::rmNearestTiesToEven);
// Constructor: parses a character-literal token spelled in [begin, end) and
// stores its numeric value in Value, with IsWide recording whether the
// spelling starts with 'L'. Escape sequences are decoded by ProcessCharEscape,
// and multi-character literals are diagnosed.
//
// NOTE(review): this listing has gaps -- the steps past the 'L' and the opening
// quote, the declarations of ResultChar/Warned, the shift that makes room for
// each new character and the counter increment are not shown. Confirm details
// against the complete file.
641 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
642 SourceLocation Loc, Preprocessor &PP) {
643 // At this point we know that the character matches the regex "L?'.*'".
646 // Determine if this is a wide character.
647 IsWide = begin[0] == 'L';
650 // Skip over the entry quote.
651 assert(begin[0] == '\'' && "Invalid token lexed");
654 // FIXME: The "Value" is an uint64_t so we can handle char literals of
// up to 64 bits.
656 // FIXME: This extensively assumes that 'char' is 8-bits.
657 assert(PP.getTargetInfo().getCharWidth() == 8 &&
658 "Assumes char is 8 bits");
659 assert(PP.getTargetInfo().getIntWidth() <= 64 &&
660 (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
661 "Assumes sizeof(int) on target is <= 64 and a multiple of char");
662 assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
663 "Assumes sizeof(wchar) on target is <= 64");
665 // This is what we will use for overflow detection
// The accumulator is as wide as the target's int.
666 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
668 unsigned NumCharsSoFar = 0;
// Consume characters until the closing quote.
670 while (begin[0] != '\'') {
672 if (begin[0] != '\\') // If this is a normal character, consume it.
673 ResultChar = *begin++;
674 else // Otherwise, this is an escape character.
675 ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
678 // If this is a multi-character constant (e.g. 'abc'), handle it. These are
679 // implementation defined (C99 6.4.4.4p10).
682 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
685 // Narrow character literals act as though their value is concatenated
686 // in this implementation, but warn on overflow.
// Fewer than 8 leading zero bits means the top byte of the accumulator is
// already in use; Warned keeps the warning to one per literal.
687 if (LitVal.countLeadingZeros() < 8 && !Warned) {
688 PP.Diag(Loc, diag::warn_char_constant_too_large);
695 LitVal = LitVal + ResultChar;
699 // If this is the second character being processed, do special handling.
700 if (NumCharsSoFar > 1) {
701 // Warn about discarding the top bits for multi-char wide-character
702 // constants (L'abcd').
704 PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
// Four-character literals get their own diagnostic, distinct from the general
// multi-character one.
705 else if (NumCharsSoFar != 4)
706 PP.Diag(Loc, diag::ext_multichar_character_literal);
708 PP.Diag(Loc, diag::ext_four_char_character_literal);
713 // Transfer the value from APInt to uint64_t
714 Value = LitVal.getZExtValue();
716 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
717 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
718 // character constants are not sign extended in this implementation:
719 // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
// (Value & 128) tests the sign bit of an 8-bit char; the cast through
// signed char then sign-extends it into Value.
720 if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
721 PP.getLangOptions().CharIsSigned)
722 Value = (signed char)Value;
726 /// string-literal: [C99 6.4.5]
727 /// " [s-char-sequence] "
728 /// L" [s-char-sequence] "
731 /// s-char-sequence s-char
733 /// any source character except the double quote ",
734 /// backslash \, or newline character
736 /// universal-character-name
737 /// escape-character: [C99 6.4.4.4]
739 /// universal-character-name
741 /// character-escape-code
742 /// octal-escape-code
744 /// character-escape-code: one of
747 /// octal-escape-code:
749 /// octal-digit octal-digit
750 /// octal-digit octal-digit octal-digit
753 /// hex-escape-code hex-digit
754 /// universal-character-name:
756 /// \U hex-quad hex-quad
758 /// hex-digit hex-digit hex-digit hex-digit
// Constructor: concatenates the NumStringToks adjacent string-literal tokens
// in StringToks into ResultBuf, decoding escape sequences and widening
// characters when any piece is a wide string literal.
//
// StringToks    - the string tokens; element 0 is read unconditionally and the
//                 sizing loop counts up from 1 with `i != NumStringToks`, so
//                 NumStringToks must be at least 1.
// NumStringToks - number of tokens in StringToks.
// pp            - preprocessor used for spellings, diagnostics, target info.
// Complain      - forwarded to ProcessCharEscape and gates the Pascal-string
//                 length diagnostic.
//
// NOTE(review): this listing has gaps -- several conditions (for example the
// AnyWide and Pascal checks), loop heads, cursor increments and the tail of
// some calls are not shown. Confirm details against the complete file.
760 StringLiteralParser::
761 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
762 Preprocessor &pp, bool Complain) : PP(pp) {
763 // Scan all of the string portions, remember the max individual token length,
764 // computing a bound on the concatenated string length, and see whether any
765 // piece is a wide-string. If any of the string portions is a wide-string
766 // literal, the result is a wide-string literal [C99 6.4.5p4].
767 MaxTokenLength = StringToks[0].getLength();
768 SizeBound = StringToks[0].getLength()-2; // -2 for "".
769 AnyWide = StringToks[0].is(tok::wide_string_literal);
773 // Implement Translation Phase #6: concatenation of string literals
774 /// (C99 5.1.1.2p1). The common case is only one string fragment.
775 for (unsigned i = 1; i != NumStringToks; ++i) {
776 // The string could be shorter than this if it needs cleaning, but this is a
777 // reasonable bound, which is all we need.
778 SizeBound += StringToks[i].getLength()-2; // -2 for "".
780 // Remember maximum string piece length.
781 if (StringToks[i].getLength() > MaxTokenLength)
782 MaxTokenLength = StringToks[i].getLength();
784 // Remember if we see any wide strings.
785 AnyWide |= StringToks[i].is(tok::wide_string_literal);
788 // Include space for the null terminator.
791 // TODO: K&R warning: "traditional C rejects string constant concatenation"
793 // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
794 // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
// ~0U is the "not queried" placeholder.
795 wchar_tByteWidth = ~0U;
// The target reports the width in bits; it is converted to bytes after
// checking that it is a whole number of bytes.
797 wchar_tByteWidth = PP.getTargetInfo().getWCharWidth();
798 assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
799 wchar_tByteWidth /= 8;
802 // The output buffer size needs to be large enough to hold wide characters.
803 // This is a worst-case assumption which basically corresponds to L"" "long".
805 SizeBound *= wchar_tByteWidth;
807 // Size the temporary buffer to hold the result string data.
808 ResultBuf.resize(SizeBound);
810 // Likewise, but for each string piece.
811 llvm::SmallString<512> TokenBuf;
812 TokenBuf.resize(MaxTokenLength);
814 // Loop over all the strings, getting their spelling, and expanding them to
815 // wide strings as appropriate.
816 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
820 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
821 const char *ThisTokBuf = &TokenBuf[0];
822 // Get the spelling of the token, which eliminates trigraphs, etc. We know
823 // that ThisTokBuf points to a buffer that is big enough for the whole token
824 // and 'spelled' tokens can only shrink.
825 bool StringInvalid = false;
826 unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf,
833 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
835 // TODO: Input character set mapping support.
837 // Skip L marker for wide strings.
838 bool ThisIsWide = false;
839 if (ThisTokBuf[0] == 'L') {
844 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
847 // Check if this is a pascal string
// Needs the PascalStrings option, a body of more than one character, and a
// leading \p.
848 if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
849 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
851 // If the \p sequence is found in the first token, we have a pascal string
852 // Otherwise, if we already have a pascal string, ignore the first \p
// Decode the body of this piece up to (not including) the closing quote.
860 while (ThisTokBuf != ThisTokEnd) {
861 // Is this a span of non-escape characters?
862 if (ThisTokBuf[0] != '\\') {
863 const char *InStart = ThisTokBuf;
// Extend the span up to the next backslash or the end of the piece.
866 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
868 // Copy the character span over.
869 unsigned Len = ThisTokBuf-InStart;
// Bulk byte copy of the span (the condition selecting this path is not
// visible in this listing).
871 memcpy(ResultPtr, InStart, Len);
874 // Note: our internal rep of wide char tokens is always little-endian.
// Widening path: each source byte becomes one wide character.
875 for (; Len; --Len, ++InStart) {
876 *ResultPtr++ = InStart[0];
877 // Add zeros at the end.
// NOTE(review): this `i` shadows the token index `i` of the enclosing loop.
878 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
884 // Is this a Universal Character Name escape?
// ThisTokBuf[0] is the backslash here, so [1] is the escape letter.
885 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
// ProcessUCNEscape writes through ResultPtr and advances it itself.
886 ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
887 hadError, StringToks[i].getLocation(), ThisIsWide, PP,
891 // Otherwise, this is a non-UCN escape character. Process it.
892 unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
893 StringToks[i].getLocation(),
894 ThisIsWide, PP, Complain);
896 // Note: our internal rep of wide char tokens is always little-endian.
// Low byte first...
897 *ResultPtr++ = ResultChar & 0xFF;
// ...then the remaining bytes in increasing significance.
// NOTE(review): this `i` also shadows the token index `i`.
900 for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
901 *ResultPtr++ = ResultChar >> i*8;
// Pascal strings: the first byte of the buffer receives the number of bytes
// written after it.
907 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
909 // Verify that pascal strings aren't too large.
// The diagnostic's source range runs from the first to the last token.
910 if (GetStringLength() > 256 && Complain) {
911 PP.Diag(StringToks[0].getLocation(), diag::err_pascal_string_too_long)
912 << SourceRange(StringToks[0].getLocation(),
913 StringToks[NumStringToks-1].getLocation());
921 /// getOffsetOfStringByte - This function returns the offset of the
922 /// specified byte of the string data represented by Token. This handles
923 /// advancing over escape sequences in the string.
///
/// \param Tok  The string-literal token to inspect. Wide strings are not
///             supported (asserted below).
/// \returns    The distance from the start of the token's spelling (the
///             position of the opening quote) to the cursor position reached.
///
/// NOTE(review): this listing has gaps -- the rest of the parameter list, the
/// StringInvalid check, the step past the opening quote and the loop header
/// are not shown. Confirm details against the complete file.
924 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
928 // Get the spelling of the token.
929 llvm::SmallString<16> SpellingBuffer;
930 SpellingBuffer.resize(Tok.getLength());
932 bool StringInvalid = false;
// SpellingPtr starts at the local buffer and is handed to getSpelling
// together with the StringInvalid flag.
933 const char *SpellingPtr = &SpellingBuffer[0];
934 unsigned TokLen = PP.getSpelling(Tok, SpellingPtr, &StringInvalid);
939 assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
// SpellingStart is captured before the opening quote is skipped, so the
// returned offset counts from the start of the spelling.
942 const char *SpellingStart = SpellingPtr;
943 const char *SpellingEnd = SpellingPtr+TokLen;
945 // Skip over the leading quote.
946 assert(SpellingPtr[0] == '"' && "Should be a string literal!");
949 // Skip over bytes until we find the offset we're looking for.
951 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
953 // Step over non-escapes simply.
954 if (*SpellingPtr != '\\') {
960 // Otherwise, this is an escape character. Advance over it.
// ProcessCharEscape is used only to move SpellingPtr past the escape; its
// return value is ignored. IsWide is passed as false.
961 bool HadError = false;
962 ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
963 Tok.getLocation(), false, PP, Complain);
964 assert(!HadError && "This method isn't valid on erroneous strings");
968 return SpellingPtr-SpellingStart;