contrib/llvm/lib/Support/StringRef.cpp

   1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9
  10 #include "llvm/ADT/StringRef.h"
  11 #include "llvm/ADT/APInt.h"
  12 #include "llvm/ADT/Hashing.h"
  13 #include "llvm/ADT/edit_distance.h"
  14 #include <bitset>
  15
  16 using namespace llvm;
  17
  18 // MSVC emits references to this into the translation units which reference it.
  19 #ifndef _MSC_VER
  20 const size_t StringRef::npos;
  21 #endif
  22
  23 static char ascii_tolower(char x) {
  24   if (x >= 'A' && x <= 'Z')
  25     return x - 'A' + 'a';
  26   return x;
  27 }
  28
  29 static char ascii_toupper(char x) {
  30   if (x >= 'a' && x <= 'z')
  31     return x - 'a' + 'A';
  32   return x;
  33 }
  34
  35 static bool ascii_isdigit(char x) {
  36   return x >= '0' && x <= '9';
  37 }
  38
  39 // strncasecmp() is not available on non-POSIX systems, so define an
  40 // alternative function here.
  41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
  42   for (size_t I = 0; I < Length; ++I) {
  43     unsigned char LHC = ascii_tolower(LHS[I]);
  44     unsigned char RHC = ascii_tolower(RHS[I]);
  45     if (LHC != RHC)
  46       return LHC < RHC ? -1 : 1;
  47   }
  48   return 0;
  49 }
  50
  51 /// compare_lower - Compare strings, ignoring case.
  52 int StringRef::compare_lower(StringRef RHS) const {
  53   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
  54     return Res;
  55   if (Length == RHS.Length)
  56     return 0;
  57   return Length < RHS.Length ? -1 : 1;
  58 }
  59
  60 /// Check if this string starts with the given \p Prefix, ignoring case.
  61 bool StringRef::startswith_lower(StringRef Prefix) const {
  62   return Length >= Prefix.Length &&
  63       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
  64 }
  65
  66 /// Check if this string ends with the given \p Suffix, ignoring case.
  67 bool StringRef::endswith_lower(StringRef Suffix) const {
  68   return Length >= Suffix.Length &&
  69       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
  70 }
  71
  72 size_t StringRef::find_lower(char C, size_t From) const {
  73   char L = ascii_tolower(C);
  74   return find_if([L](char D) { return ascii_tolower(D) == L; }, From);
  75 }
  76
  77 /// compare_numeric - Compare strings, handle embedded numbers.
  78 int StringRef::compare_numeric(StringRef RHS) const {
  79   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
  80     // Check for sequences of digits.
  81     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
  82       // The longer sequence of numbers is considered larger.
  83       // This doesn't really handle prefixed zeros well.
  84       size_t J;
  85       for (J = I + 1; J != E + 1; ++J) {
  86         bool ld = J < Length && ascii_isdigit(Data[J]);
  87         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
  88         if (ld != rd)
  89           return rd ? -1 : 1;
  90         if (!rd)
  91           break;
  92       }
  93       // The two number sequences have the same length (J-I), just memcmp them.
  94       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
  95         return Res < 0 ? -1 : 1;
  96       // Identical number sequences, continue search after the numbers.
  97       I = J - 1;
  98       continue;
  99     }
 100     if (Data[I] != RHS.Data[I])
 101       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
 102   }
 103   if (Length == RHS.Length)
 104     return 0;
 105   return Length < RHS.Length ? -1 : 1;
 106 }
 107
 108 // Compute the edit distance between the two given strings.
 109 unsigned StringRef::edit_distance(llvm::StringRef Other,
 110                                   bool AllowReplacements,
 111                                   unsigned MaxEditDistance) const {
 112   return llvm::ComputeEditDistance(
 113       makeArrayRef(data(), size()),
 114       makeArrayRef(Other.data(), Other.size()),
 115       AllowReplacements, MaxEditDistance);
 116 }
 117
 118 //===----------------------------------------------------------------------===//
 119 // String Operations
 120 //===----------------------------------------------------------------------===//
 121
 122 std::string StringRef::lower() const {
 123   std::string Result(size(), char());
 124   for (size_type i = 0, e = size(); i != e; ++i) {
 125     Result[i] = ascii_tolower(Data[i]);
 126   }
 127   return Result;
 128 }
 129
 130 std::string StringRef::upper() const {
 131   std::string Result(size(), char());
 132   for (size_type i = 0, e = size(); i != e; ++i) {
 133     Result[i] = ascii_toupper(Data[i]);
 134   }
 135   return Result;
 136 }
 137
 138 //===----------------------------------------------------------------------===//
 139 // String Searching
 140 //===----------------------------------------------------------------------===//
 141
 142
 143 /// find - Search for the first string \arg Str in the string.
 144 ///
 145 /// \return - The index of the first occurrence of \arg Str, or npos if not
 146 /// found.
 147 size_t StringRef::find(StringRef Str, size_t From) const {
 148   if (From > Length)
 149     return npos;
 150
 151   const char *Start = Data + From;
 152   size_t Size = Length - From;
 153
 154   const char *Needle = Str.data();
 155   size_t N = Str.size();
 156   if (N == 0)
 157     return From;
 158   if (Size < N)
 159     return npos;
 160   if (N == 1) {
 161     const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
 162     return Ptr == nullptr ? npos : Ptr - Data;
 163   }
 164
 165   const char *Stop = Start + (Size - N + 1);
 166
 167   // For short haystacks or unsupported needles fall back to the naive algorithm
 168   if (Size < 16 || N > 255) {
 169     do {
 170       if (std::memcmp(Start, Needle, N) == 0)
 171         return Start - Data;
 172       ++Start;
 173     } while (Start < Stop);
 174     return npos;
 175   }
 176
 177   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
 178   uint8_t BadCharSkip[256];
 179   std::memset(BadCharSkip, N, 256);
 180   for (unsigned i = 0; i != N-1; ++i)
 181     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
 182
 183   do {
 184     uint8_t Last = Start[N - 1];
 185     if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
 186       if (std::memcmp(Start, Needle, N - 1) == 0)
 187         return Start - Data;
 188
 189     // Otherwise skip the appropriate number of bytes.
 190     Start += BadCharSkip[Last];
 191   } while (Start < Stop);
 192
 193   return npos;
 194 }
 195
 196 size_t StringRef::find_lower(StringRef Str, size_t From) const {
 197   StringRef This = substr(From);
 198   while (This.size() >= Str.size()) {
 199     if (This.startswith_lower(Str))
 200       return From;
 201     This = This.drop_front();
 202     ++From;
 203   }
 204   return npos;
 205 }
 206
 207 size_t StringRef::rfind_lower(char C, size_t From) const {
 208   From = std::min(From, Length);
 209   size_t i = From;
 210   while (i != 0) {
 211     --i;
 212     if (ascii_tolower(Data[i]) == ascii_tolower(C))
 213       return i;
 214   }
 215   return npos;
 216 }
 217
 218 /// rfind - Search for the last string \arg Str in the string.
 219 ///
 220 /// \return - The index of the last occurrence of \arg Str, or npos if not
 221 /// found.
 222 size_t StringRef::rfind(StringRef Str) const {
 223   size_t N = Str.size();
 224   if (N > Length)
 225     return npos;
 226   for (size_t i = Length - N + 1, e = 0; i != e;) {
 227     --i;
 228     if (substr(i, N).equals(Str))
 229       return i;
 230   }
 231   return npos;
 232 }
 233
 234 size_t StringRef::rfind_lower(StringRef Str) const {
 235   size_t N = Str.size();
 236   if (N > Length)
 237     return npos;
 238   for (size_t i = Length - N + 1, e = 0; i != e;) {
 239     --i;
 240     if (substr(i, N).equals_lower(Str))
 241       return i;
 242   }
 243   return npos;
 244 }
 245
 246 /// find_first_of - Find the first character in the string that is in \arg
 247 /// Chars, or npos if not found.
 248 ///
 249 /// Note: O(size() + Chars.size())
 250 StringRef::size_type StringRef::find_first_of(StringRef Chars,
 251                                               size_t From) const {
 252   std::bitset<1 << CHAR_BIT> CharBits;
 253   for (size_type i = 0; i != Chars.size(); ++i)
 254     CharBits.set((unsigned char)Chars[i]);
 255
 256   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 257     if (CharBits.test((unsigned char)Data[i]))
 258       return i;
 259   return npos;
 260 }
 261
 262 /// find_first_not_of - Find the first character in the string that is not
 263 /// \arg C or npos if not found.
 264 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
 265   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 266     if (Data[i] != C)
 267       return i;
 268   return npos;
 269 }
 270
 271 /// find_first_not_of - Find the first character in the string that is not
 272 /// in the string \arg Chars, or npos if not found.
 273 ///
 274 /// Note: O(size() + Chars.size())
 275 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
 276                                                   size_t From) const {
 277   std::bitset<1 << CHAR_BIT> CharBits;
 278   for (size_type i = 0; i != Chars.size(); ++i)
 279     CharBits.set((unsigned char)Chars[i]);
 280
 281   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
 282     if (!CharBits.test((unsigned char)Data[i]))
 283       return i;
 284   return npos;
 285 }
 286
 287 /// find_last_of - Find the last character in the string that is in \arg C,
 288 /// or npos if not found.
 289 ///
 290 /// Note: O(size() + Chars.size())
 291 StringRef::size_type StringRef::find_last_of(StringRef Chars,
 292                                              size_t From) const {
 293   std::bitset<1 << CHAR_BIT> CharBits;
 294   for (size_type i = 0; i != Chars.size(); ++i)
 295     CharBits.set((unsigned char)Chars[i]);
 296
 297   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 298     if (CharBits.test((unsigned char)Data[i]))
 299       return i;
 300   return npos;
 301 }
 302
 303 /// find_last_not_of - Find the last character in the string that is not
 304 /// \arg C, or npos if not found.
 305 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
 306   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 307     if (Data[i] != C)
 308       return i;
 309   return npos;
 310 }
 311
 312 /// find_last_not_of - Find the last character in the string that is not in
 313 /// \arg Chars, or npos if not found.
 314 ///
 315 /// Note: O(size() + Chars.size())
 316 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
 317                                                  size_t From) const {
 318   std::bitset<1 << CHAR_BIT> CharBits;
 319   for (size_type i = 0, e = Chars.size(); i != e; ++i)
 320     CharBits.set((unsigned char)Chars[i]);
 321
 322   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
 323     if (!CharBits.test((unsigned char)Data[i]))
 324       return i;
 325   return npos;
 326 }
 327
 328 void StringRef::split(SmallVectorImpl<StringRef> &A,
 329                       StringRef Separator, int MaxSplit,
 330                       bool KeepEmpty) const {
 331   StringRef S = *this;
 332
 333   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 334   // "forever". This doesn't support splitting more than 2^31 times
 335   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 336   // but that seems unlikely to be useful.
 337   while (MaxSplit-- != 0) {
 338     size_t Idx = S.find(Separator);
 339     if (Idx == npos)
 340       break;
 341
 342     // Push this split.
 343     if (KeepEmpty || Idx > 0)
 344       A.push_back(S.slice(0, Idx));
 345
 346     // Jump forward.
 347     S = S.slice(Idx + Separator.size(), npos);
 348   }
 349
 350   // Push the tail.
 351   if (KeepEmpty || !S.empty())
 352     A.push_back(S);
 353 }
 354
 355 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
 356                       int MaxSplit, bool KeepEmpty) const {
 357   StringRef S = *this;
 358
 359   // Count down from MaxSplit. When MaxSplit is -1, this will just split
 360   // "forever". This doesn't support splitting more than 2^31 times
 361   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
 362   // but that seems unlikely to be useful.
 363   while (MaxSplit-- != 0) {
 364     size_t Idx = S.find(Separator);
 365     if (Idx == npos)
 366       break;
 367
 368     // Push this split.
 369     if (KeepEmpty || Idx > 0)
 370       A.push_back(S.slice(0, Idx));
 371
 372     // Jump forward.
 373     S = S.slice(Idx + 1, npos);
 374   }
 375
 376   // Push the tail.
 377   if (KeepEmpty || !S.empty())
 378     A.push_back(S);
 379 }
 380
 381 //===----------------------------------------------------------------------===//
 382 // Helpful Algorithms
 383 //===----------------------------------------------------------------------===//
 384
 385 /// count - Return the number of non-overlapped occurrences of \arg Str in
 386 /// the string.
 387 size_t StringRef::count(StringRef Str) const {
 388   size_t Count = 0;
 389   size_t N = Str.size();
 390   if (N > Length)
 391     return 0;
 392   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
 393     if (substr(i, N).equals(Str))
 394       ++Count;
 395   return Count;
 396 }
 397
 398 static unsigned GetAutoSenseRadix(StringRef &Str) {
 399   if (Str.empty())
 400     return 10;
 401
 402   if (Str.startswith("0x") || Str.startswith("0X")) {
 403     Str = Str.substr(2);
 404     return 16;
 405   }
 406
 407   if (Str.startswith("0b") || Str.startswith("0B")) {
 408     Str = Str.substr(2);
 409     return 2;
 410   }
 411
 412   if (Str.startswith("0o")) {
 413     Str = Str.substr(2);
 414     return 8;
 415   }
 416
 417   if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) {
 418     Str = Str.substr(1);
 419     return 8;
 420   }
 421
 422   return 10;
 423 }
 424
 425 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
 426                                   unsigned long long &Result) {
 427   // Autosense radix if not specified.
 428   if (Radix == 0)
 429     Radix = GetAutoSenseRadix(Str);
 430
 431   // Empty strings (after the radix autosense) are invalid.
 432   if (Str.empty()) return true;
 433
 434   // Parse all the bytes of the string given this radix.  Watch for overflow.
 435   StringRef Str2 = Str;
 436   Result = 0;
 437   while (!Str2.empty()) {
 438     unsigned CharVal;
 439     if (Str2[0] >= '0' && Str2[0] <= '9')
 440       CharVal = Str2[0] - '0';
 441     else if (Str2[0] >= 'a' && Str2[0] <= 'z')
 442       CharVal = Str2[0] - 'a' + 10;
 443     else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
 444       CharVal = Str2[0] - 'A' + 10;
 445     else
 446       break;
 447
 448     // If the parsed value is larger than the integer radix, we cannot
 449     // consume any more characters.
 450     if (CharVal >= Radix)
 451       break;
 452
 453     // Add in this character.
 454     unsigned long long PrevResult = Result;
 455     Result = Result * Radix + CharVal;
 456
 457     // Check for overflow by shifting back and seeing if bits were lost.
 458     if (Result / Radix < PrevResult)
 459       return true;
 460
 461     Str2 = Str2.substr(1);
 462   }
 463
 464   // We consider the operation a failure if no characters were consumed
 465   // successfully.
 466   if (Str.size() == Str2.size())
 467     return true;
 468
 469   Str = Str2;
 470   return false;
 471 }
 472
 473 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
 474                                 long long &Result) {
 475   unsigned long long ULLVal;
 476
 477   // Handle positive strings first.
 478   if (Str.empty() || Str.front() != '-') {
 479     if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
 480         // Check for value so large it overflows a signed value.
 481         (long long)ULLVal < 0)
 482       return true;
 483     Result = ULLVal;
 484     return false;
 485   }
 486
 487   // Get the positive part of the value.
 488   StringRef Str2 = Str.drop_front(1);
 489   if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
 490       // Reject values so large they'd overflow as negative signed, but allow
 491       // "-0".  This negates the unsigned so that the negative isn't undefined
 492       // on signed overflow.
 493       (long long)-ULLVal > 0)
 494     return true;
 495
 496   Str = Str2;
 497   Result = -ULLVal;
 498   return false;
 499 }
 500
 501 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
 502 /// sequence of radix up to 36 to an unsigned long long value.
 503 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
 504                                 unsigned long long &Result) {
 505   if (consumeUnsignedInteger(Str, Radix, Result))
 506     return true;
 507
 508   // For getAsUnsignedInteger, we require the whole string to be consumed or
 509   // else we consider it a failure.
 510   return !Str.empty();
 511 }
 512
 513 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
 514                               long long &Result) {
 515   if (consumeSignedInteger(Str, Radix, Result))
 516     return true;
 517
 518   // For getAsSignedInteger, we require the whole string to be consumed or else
 519   // we consider it a failure.
 520   return !Str.empty();
 521 }
 522
 523 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
 524   StringRef Str = *this;
 525
 526   // Autosense radix if not specified.
 527   if (Radix == 0)
 528     Radix = GetAutoSenseRadix(Str);
 529
 530   assert(Radix > 1 && Radix <= 36);
 531
 532   // Empty strings (after the radix autosense) are invalid.
 533   if (Str.empty()) return true;
 534
 535   // Skip leading zeroes.  This can be a significant improvement if
 536   // it means we don't need > 64 bits.
 537   while (!Str.empty() && Str.front() == '0')
 538     Str = Str.substr(1);
 539
 540   // If it was nothing but zeroes....
 541   if (Str.empty()) {
 542     Result = APInt(64, 0);
 543     return false;
 544   }
 545
 546   // (Over-)estimate the required number of bits.
 547   unsigned Log2Radix = 0;
 548   while ((1U << Log2Radix) < Radix) Log2Radix++;
 549   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
 550
 551   unsigned BitWidth = Log2Radix * Str.size();
 552   if (BitWidth < Result.getBitWidth())
 553     BitWidth = Result.getBitWidth(); // don't shrink the result
 554   else if (BitWidth > Result.getBitWidth())
 555     Result = Result.zext(BitWidth);
 556
 557   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
 558   if (!IsPowerOf2Radix) {
 559     // These must have the same bit-width as Result.
 560     RadixAP = APInt(BitWidth, Radix);
 561     CharAP = APInt(BitWidth, 0);
 562   }
 563
 564   // Parse all the bytes of the string given this radix.
 565   Result = 0;
 566   while (!Str.empty()) {
 567     unsigned CharVal;
 568     if (Str[0] >= '0' && Str[0] <= '9')
 569       CharVal = Str[0]-'0';
 570     else if (Str[0] >= 'a' && Str[0] <= 'z')
 571       CharVal = Str[0]-'a'+10;
 572     else if (Str[0] >= 'A' && Str[0] <= 'Z')
 573       CharVal = Str[0]-'A'+10;
 574     else
 575       return true;
 576
 577     // If the parsed value is larger than the integer radix, the string is
 578     // invalid.
 579     if (CharVal >= Radix)
 580       return true;
 581
 582     // Add in this character.
 583     if (IsPowerOf2Radix) {
 584       Result <<= Log2Radix;
 585       Result |= CharVal;
 586     } else {
 587       Result *= RadixAP;
 588       CharAP = CharVal;
 589       Result += CharAP;
 590     }
 591
 592     Str = Str.substr(1);
 593   }
 594
 595   return false;
 596 }
 597
 598
 599 // Implementation of StringRef hashing.
 600 hash_code llvm::hash_value(StringRef S) {
 601   return hash_combine_range(S.begin(), S.end());
 602 }