1 //===--- CacheTokens.cpp - Caching of lexer tokens for PTH support --------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This provides a possible implementation of PTH support for Clang that is
11 // based on caching lexed tokens and identifiers.
13 //===----------------------------------------------------------------------===//
15 #include "clang/Basic/Diagnostic.h"
16 #include "clang/Basic/FileManager.h"
17 #include "clang/Basic/FileSystemStatCache.h"
18 #include "clang/Basic/IdentifierTable.h"
19 #include "clang/Basic/SourceManager.h"
20 #include "clang/Frontend/Utils.h"
21 #include "clang/Lex/Lexer.h"
22 #include "clang/Lex/PTHManager.h"
23 #include "clang/Lex/Preprocessor.h"
24 #include "llvm/ADT/StringExtras.h"
25 #include "llvm/ADT/StringMap.h"
26 #include "llvm/Support/EndianStream.h"
27 #include "llvm/Support/FileSystem.h"
28 #include "llvm/Support/MemoryBuffer.h"
29 #include "llvm/Support/OnDiskHashTable.h"
30 #include "llvm/Support/Path.h"
32 // FIXME: put this somewhere else?
34 #define S_ISDIR(x) (((x)&_S_IFDIR)!=0)
37 using namespace clang;
39 //===----------------------------------------------------------------------===//
40 // PTH-specific stuff.
41 //===----------------------------------------------------------------------===//
43 typedef uint32_t Offset;
47 Offset TokenData, PPCondData;
52 PTHEntry(Offset td, Offset ppcd)
53 : TokenData(td), PPCondData(ppcd) {}
55 Offset getTokenOffset() const { return TokenData; }
56 Offset getPPCondTableOffset() const { return PPCondData; }
60 class PTHEntryKeyVariant {
63 // FIXME: Use "StringRef Path;" when MSVC 2013 is dropped.
67 enum { IsFE = 0x1, IsDE = 0x2, IsNoExist = 0x0 } Kind;
71 PTHEntryKeyVariant(const FileEntry *fe) : FE(fe), Kind(IsFE), Data(nullptr) {}
73 PTHEntryKeyVariant(FileData *Data, StringRef Path)
74 : PathPtr(Path.data()), PathSize(Path.size()), Kind(IsDE),
75 Data(new FileData(*Data)) {}
77 explicit PTHEntryKeyVariant(StringRef Path)
78 : PathPtr(Path.data()), PathSize(Path.size()), Kind(IsNoExist),
81 bool isFile() const { return Kind == IsFE; }
83 StringRef getString() const {
84 return Kind == IsFE ? FE->getName() : StringRef(PathPtr, PathSize);
87 unsigned getKind() const { return (unsigned) Kind; }
89 void EmitData(raw_ostream& Out) {
90 using namespace llvm::support;
91 endian::Writer<little> LE(Out);
94 // Emit stat information.
95 llvm::sys::fs::UniqueID UID = FE->getUniqueID();
96 LE.write<uint64_t>(UID.getFile());
97 LE.write<uint64_t>(UID.getDevice());
98 LE.write<uint64_t>(FE->getModificationTime());
99 LE.write<uint64_t>(FE->getSize());
102 // Emit stat information.
103 LE.write<uint64_t>(Data->UniqueID.getFile());
104 LE.write<uint64_t>(Data->UniqueID.getDevice());
105 LE.write<uint64_t>(Data->ModTime);
106 LE.write<uint64_t>(Data->Size);
114 unsigned getRepresentationLength() const {
115 return Kind == IsNoExist ? 0 : 4 * 8;
119 class FileEntryPTHEntryInfo {
121 typedef PTHEntryKeyVariant key_type;
122 typedef key_type key_type_ref;
124 typedef PTHEntry data_type;
125 typedef const PTHEntry& data_type_ref;
127 typedef unsigned hash_value_type;
128 typedef unsigned offset_type;
130 static hash_value_type ComputeHash(PTHEntryKeyVariant V) {
131 return llvm::HashString(V.getString());
134 static std::pair<unsigned,unsigned>
135 EmitKeyDataLength(raw_ostream& Out, PTHEntryKeyVariant V,
137 using namespace llvm::support;
138 endian::Writer<little> LE(Out);
140 unsigned n = V.getString().size() + 1 + 1;
141 LE.write<uint16_t>(n);
143 unsigned m = V.getRepresentationLength() + (V.isFile() ? 4 + 4 : 0);
144 LE.write<uint8_t>(m);
146 return std::make_pair(n, m);
149 static void EmitKey(raw_ostream& Out, PTHEntryKeyVariant V, unsigned n){
150 using namespace llvm::support;
151 // Emit the entry kind.
152 endian::Writer<little>(Out).write<uint8_t>((unsigned)V.getKind());
154 Out.write(V.getString().data(), n - 1);
157 static void EmitData(raw_ostream& Out, PTHEntryKeyVariant V,
158 const PTHEntry& E, unsigned) {
159 using namespace llvm::support;
160 endian::Writer<little> LE(Out);
162 // For file entries emit the offsets into the PTH file for token data
163 // and the preprocessor blocks table.
165 LE.write<uint32_t>(E.getTokenOffset());
166 LE.write<uint32_t>(E.getPPCondTableOffset());
169 // Emit any other data associated with the key (i.e., stat information).
178 OffsetOpt() : valid(false) {}
179 bool hasOffset() const { return valid; }
180 Offset getOffset() const { assert(valid); return off; }
181 void setOffset(Offset o) { off = o; valid = true; }
183 } // end anonymous namespace
185 typedef llvm::OnDiskChainedHashTableGenerator<FileEntryPTHEntryInfo> PTHMap;
189 typedef llvm::DenseMap<const IdentifierInfo*,uint32_t> IDMap;
190 typedef llvm::StringMap<OffsetOpt, llvm::BumpPtrAllocator> CachedStrsTy;
192 raw_pwrite_stream &Out;
195 std::vector<llvm::StringMapEntry<OffsetOpt>*> StrEntries;
197 CachedStrsTy CachedStrs;
201 //// Get the persistent id for the given IdentifierInfo*.
202 uint32_t ResolveID(const IdentifierInfo* II);
204 /// Emit a token to the PTH file.
205 void EmitToken(const Token& T);
207 void Emit8(uint32_t V) {
208 using namespace llvm::support;
209 endian::Writer<little>(Out).write<uint8_t>(V);
212 void Emit16(uint32_t V) {
213 using namespace llvm::support;
214 endian::Writer<little>(Out).write<uint16_t>(V);
217 void Emit32(uint32_t V) {
218 using namespace llvm::support;
219 endian::Writer<little>(Out).write<uint32_t>(V);
222 void EmitBuf(const char *Ptr, unsigned NumBytes) {
223 Out.write(Ptr, NumBytes);
226 void EmitString(StringRef V) {
227 using namespace llvm::support;
228 endian::Writer<little>(Out).write<uint16_t>(V.size());
229 EmitBuf(V.data(), V.size());
232 /// EmitIdentifierTable - Emits two tables to the PTH file. The first is
233 /// a hashtable mapping from identifier strings to persistent IDs.
234 /// The second is a straight table mapping from persistent IDs to string data
235 /// (the keys of the first table).
236 std::pair<Offset, Offset> EmitIdentifierTable();
238 /// EmitFileTable - Emit a table mapping from file name strings to PTH
240 Offset EmitFileTable() { return PM.Emit(Out); }
242 PTHEntry LexTokens(Lexer& L);
243 Offset EmitCachedSpellings();
246 PTHWriter(raw_pwrite_stream &out, Preprocessor &pp)
247 : Out(out), PP(pp), idcount(0), CurStrOffset(0) {}
249 PTHMap &getPM() { return PM; }
250 void GeneratePTH(StringRef MainFile);
252 } // end anonymous namespace
254 uint32_t PTHWriter::ResolveID(const IdentifierInfo* II) {
255 // Null IdentifierInfo's map to the persistent ID 0.
259 IDMap::iterator I = IM.find(II);
261 return I->second; // We've already added 1.
263 IM[II] = ++idcount; // Pre-increment since '0' is reserved for NULL.
267 void PTHWriter::EmitToken(const Token& T) {
268 // Emit the token kind, flags, and length.
269 Emit32(((uint32_t) T.getKind()) | ((((uint32_t) T.getFlags())) << 8)|
270 (((uint32_t) T.getLength()) << 16));
272 if (!T.isLiteral()) {
273 Emit32(ResolveID(T.getIdentifierInfo()));
275 // We cache *un-cleaned* spellings. This gives us 100% fidelity with the
277 StringRef s(T.getLiteralData(), T.getLength());
279 // Get the string entry.
280 auto &E = *CachedStrs.insert(std::make_pair(s, OffsetOpt())).first;
282 // If this is a new string entry, bump the PTH offset.
283 if (!E.second.hasOffset()) {
284 E.second.setOffset(CurStrOffset);
285 StrEntries.push_back(&E);
286 CurStrOffset += s.size() + 1;
289 // Emit the relative offset into the PTH file for the spelling string.
290 Emit32(E.second.getOffset());
293 // Emit the offset into the original source file of this token so that we
294 // can reconstruct its SourceLocation.
295 Emit32(PP.getSourceManager().getFileOffset(T.getLocation()));
298 PTHEntry PTHWriter::LexTokens(Lexer& L) {
299 // Pad 0's so that we emit tokens to a 4-byte alignment.
300 // This speed up reading them back in.
301 using namespace llvm::support;
302 endian::Writer<little> LE(Out);
303 uint32_t TokenOff = Out.tell();
304 for (uint64_t N = llvm::OffsetToAlignment(TokenOff, 4); N; --N, ++TokenOff)
305 LE.write<uint8_t>(0);
307 // Keep track of matching '#if' ... '#endif'.
308 typedef std::vector<std::pair<Offset, unsigned> > PPCondTable;
310 std::vector<unsigned> PPStartCond;
311 bool ParsingPreprocessorDirective = false;
315 L.LexFromRawLexer(Tok);
318 if ((Tok.isAtStartOfLine() || Tok.is(tok::eof)) &&
319 ParsingPreprocessorDirective) {
320 // Insert an eod token into the token cache. It has the same
321 // position as the next token that is not on the same line as the
322 // preprocessor directive. Observe that we continue processing
323 // 'Tok' when we exit this branch.
325 Tmp.setKind(tok::eod);
326 Tmp.clearFlag(Token::StartOfLine);
327 Tmp.setIdentifierInfo(nullptr);
329 ParsingPreprocessorDirective = false;
332 if (Tok.is(tok::raw_identifier)) {
333 PP.LookUpIdentifierInfo(Tok);
338 if (Tok.is(tok::hash) && Tok.isAtStartOfLine()) {
339 // Special processing for #include. Store the '#' token and lex
341 assert(!ParsingPreprocessorDirective);
342 Offset HashOff = (Offset) Out.tell();
344 // Get the next token.
346 L.LexFromRawLexer(NextTok);
348 // If we see the start of line, then we had a null directive "#". In
349 // this case, discard both tokens.
350 if (NextTok.isAtStartOfLine())
353 // The token is the start of a directive. Emit it.
357 // Did we see 'include'/'import'/'include_next'?
358 if (Tok.isNot(tok::raw_identifier)) {
363 IdentifierInfo* II = PP.LookUpIdentifierInfo(Tok);
364 tok::PPKeywordKind K = II->getPPKeywordID();
366 ParsingPreprocessorDirective = true;
369 case tok::pp_not_keyword:
370 // Invalid directives "#foo" can occur in #if 0 blocks etc, just pass
375 case tok::pp_include:
377 case tok::pp_include_next: {
378 // Save the 'include' token.
380 // Lex the next token as an include string.
381 L.setParsingPreprocessorDirective(true);
382 L.LexIncludeFilename(Tok);
383 L.setParsingPreprocessorDirective(false);
384 assert(!Tok.isAtStartOfLine());
385 if (Tok.is(tok::raw_identifier))
386 PP.LookUpIdentifierInfo(Tok);
392 case tok::pp_ifndef: {
393 // Add an entry for '#if' and friends. We initially set the target
394 // index to 0. This will get backpatched when we hit #endif.
395 PPStartCond.push_back(PPCond.size());
396 PPCond.push_back(std::make_pair(HashOff, 0U));
399 case tok::pp_endif: {
400 // Add an entry for '#endif'. We set the target table index to itself.
401 // This will later be set to zero when emitting to the PTH file. We
402 // use 0 for uninitialized indices because that is easier to debug.
403 unsigned index = PPCond.size();
404 // Backpatch the opening '#if' entry.
405 assert(!PPStartCond.empty());
406 assert(PPCond.size() > PPStartCond.back());
407 assert(PPCond[PPStartCond.back()].second == 0);
408 PPCond[PPStartCond.back()].second = index;
409 PPStartCond.pop_back();
410 // Add the new entry to PPCond.
411 PPCond.push_back(std::make_pair(HashOff, index));
414 // Some files have gibberish on the same line as '#endif'.
415 // Discard these tokens.
417 L.LexFromRawLexer(Tok);
418 while (Tok.isNot(tok::eof) && !Tok.isAtStartOfLine());
419 // We have the next token in hand.
420 // Don't immediately lex the next one.
425 // Add an entry for #elif or #else.
426 // This serves as both a closing and opening of a conditional block.
427 // This means that its entry will get backpatched later.
428 unsigned index = PPCond.size();
429 // Backpatch the previous '#if' entry.
430 assert(!PPStartCond.empty());
431 assert(PPCond.size() > PPStartCond.back());
432 assert(PPCond[PPStartCond.back()].second == 0);
433 PPCond[PPStartCond.back()].second = index;
434 PPStartCond.pop_back();
435 // Now add '#elif' as a new block opening.
436 PPCond.push_back(std::make_pair(HashOff, 0U));
437 PPStartCond.push_back(index);
445 while (Tok.isNot(tok::eof));
447 assert(PPStartCond.empty() && "Error: imblanced preprocessor conditionals.");
449 // Next write out PPCond.
450 Offset PPCondOff = (Offset) Out.tell();
452 // Write out the size of PPCond so that clients can identifer empty tables.
453 Emit32(PPCond.size());
455 for (unsigned i = 0, e = PPCond.size(); i!=e; ++i) {
456 Emit32(PPCond[i].first - TokenOff);
457 uint32_t x = PPCond[i].second;
458 assert(x != 0 && "PPCond entry not backpatched.");
459 // Emit zero for #endifs. This allows us to do checking when
460 // we read the PTH file back in.
461 Emit32(x == i ? 0 : x);
464 return PTHEntry(TokenOff, PPCondOff);
467 Offset PTHWriter::EmitCachedSpellings() {
468 // Write each cached strings to the PTH file.
469 Offset SpellingsOff = Out.tell();
471 for (std::vector<llvm::StringMapEntry<OffsetOpt>*>::iterator
472 I = StrEntries.begin(), E = StrEntries.end(); I!=E; ++I)
473 EmitBuf((*I)->getKeyData(), (*I)->getKeyLength()+1 /*nul included*/);
478 static uint32_t swap32le(uint32_t X) {
479 return llvm::support::endian::byte_swap<uint32_t, llvm::support::little>(X);
482 static void pwrite32le(raw_pwrite_stream &OS, uint32_t Val, uint64_t &Off) {
483 uint32_t LEVal = swap32le(Val);
484 OS.pwrite(reinterpret_cast<const char *>(&LEVal), 4, Off);
488 void PTHWriter::GeneratePTH(StringRef MainFile) {
489 // Generate the prologue.
490 Out << "cfe-pth" << '\0';
491 Emit32(PTHManager::Version);
493 // Leave 4 words for the prologue.
494 Offset PrologueOffset = Out.tell();
495 for (unsigned i = 0; i < 4; ++i)
498 // Write the name of the MainFile.
499 if (!MainFile.empty()) {
500 EmitString(MainFile);
502 // String with 0 bytes.
507 // Iterate over all the files in SourceManager. Create a lexer
508 // for each file and cache the tokens.
509 SourceManager &SM = PP.getSourceManager();
510 const LangOptions &LOpts = PP.getLangOpts();
512 for (SourceManager::fileinfo_iterator I = SM.fileinfo_begin(),
513 E = SM.fileinfo_end(); I != E; ++I) {
514 const SrcMgr::ContentCache &C = *I->second;
515 const FileEntry *FE = C.OrigEntry;
517 // FIXME: Handle files with non-absolute paths.
518 if (llvm::sys::path::is_relative(FE->getName()))
521 const llvm::MemoryBuffer *B = C.getBuffer(PP.getDiagnostics(), SM);
524 FileID FID = SM.createFileID(FE, SourceLocation(), SrcMgr::C_User);
525 const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);
526 Lexer L(FID, FromFile, SM, LOpts);
527 PM.insert(FE, LexTokens(L));
530 // Write out the identifier table.
531 const std::pair<Offset,Offset> &IdTableOff = EmitIdentifierTable();
533 // Write out the cached strings table.
534 Offset SpellingOff = EmitCachedSpellings();
536 // Write out the file table.
537 Offset FileTableOff = EmitFileTable();
539 // Finally, write the prologue.
540 uint64_t Off = PrologueOffset;
541 pwrite32le(Out, IdTableOff.first, Off);
542 pwrite32le(Out, IdTableOff.second, Off);
543 pwrite32le(Out, FileTableOff, Off);
544 pwrite32le(Out, SpellingOff, Off);
548 /// StatListener - A simple "interpose" object used to monitor stat calls
549 /// invoked by FileManager while processing the original sources used
550 /// as input to PTH generation. StatListener populates the PTHWriter's
551 /// file map with stat information for directories as well as negative stats.
552 /// Stat information for files are populated elsewhere.
553 class StatListener : public FileSystemStatCache {
556 StatListener(PTHMap &pm) : PM(pm) {}
557 ~StatListener() override {}
559 LookupResult getStat(StringRef Path, FileData &Data, bool isFile,
560 std::unique_ptr<vfs::File> *F,
561 vfs::FileSystem &FS) override {
562 LookupResult Result = statChained(Path, Data, isFile, F, FS);
564 if (Result == CacheMissing) // Failed 'stat'.
565 PM.insert(PTHEntryKeyVariant(Path), PTHEntry());
566 else if (Data.IsDirectory) {
567 // Only cache directories with absolute paths.
568 if (llvm::sys::path::is_relative(Path))
571 PM.insert(PTHEntryKeyVariant(&Data, Path), PTHEntry());
577 } // end anonymous namespace
579 void clang::CacheTokens(Preprocessor &PP, raw_pwrite_stream *OS) {
580 // Get the name of the main file.
581 const SourceManager &SrcMgr = PP.getSourceManager();
582 const FileEntry *MainFile = SrcMgr.getFileEntryForID(SrcMgr.getMainFileID());
583 SmallString<128> MainFilePath(MainFile->getName());
585 llvm::sys::fs::make_absolute(MainFilePath);
587 // Create the PTHWriter.
588 PTHWriter PW(*OS, PP);
590 // Install the 'stat' system call listener in the FileManager.
591 auto StatCacheOwner = llvm::make_unique<StatListener>(PW.getPM());
592 StatListener *StatCache = StatCacheOwner.get();
593 PP.getFileManager().addStatCache(std::move(StatCacheOwner),
594 /*AtBeginning=*/true);
596 // Lex through the entire file. This will populate SourceManager with
597 // all of the header information.
599 PP.EnterMainSourceFile();
600 do { PP.Lex(Tok); } while (Tok.isNot(tok::eof));
602 // Generate the PTH file.
603 PP.getFileManager().removeStatCache(StatCache);
604 PW.GeneratePTH(MainFilePath.str());
607 //===----------------------------------------------------------------------===//
612 const IdentifierInfo* II;
616 class PTHIdentifierTableTrait {
618 typedef PTHIdKey* key_type;
619 typedef key_type key_type_ref;
621 typedef uint32_t data_type;
622 typedef data_type data_type_ref;
624 typedef unsigned hash_value_type;
625 typedef unsigned offset_type;
627 static hash_value_type ComputeHash(PTHIdKey* key) {
628 return llvm::HashString(key->II->getName());
631 static std::pair<unsigned,unsigned>
632 EmitKeyDataLength(raw_ostream& Out, const PTHIdKey* key, uint32_t) {
633 using namespace llvm::support;
634 unsigned n = key->II->getLength() + 1;
635 endian::Writer<little>(Out).write<uint16_t>(n);
636 return std::make_pair(n, sizeof(uint32_t));
639 static void EmitKey(raw_ostream& Out, PTHIdKey* key, unsigned n) {
640 // Record the location of the key data. This is used when generating
641 // the mapping from persistent IDs to strings.
642 key->FileOffset = Out.tell();
643 Out.write(key->II->getNameStart(), n);
646 static void EmitData(raw_ostream& Out, PTHIdKey*, uint32_t pID,
648 using namespace llvm::support;
649 endian::Writer<little>(Out).write<uint32_t>(pID);
652 } // end anonymous namespace
654 /// EmitIdentifierTable - Emits two tables to the PTH file. The first is
655 /// a hashtable mapping from identifier strings to persistent IDs. The second
656 /// is a straight table mapping from persistent IDs to string data (the
657 /// keys of the first table).
659 std::pair<Offset,Offset> PTHWriter::EmitIdentifierTable() {
661 // (1) an inverse map from persistent IDs -> (IdentifierInfo*,Offset)
662 // (2) a map from (IdentifierInfo*, Offset)* -> persistent IDs
664 // Note that we use 'calloc', so all the bytes are 0.
665 PTHIdKey *IIDMap = (PTHIdKey*)calloc(idcount, sizeof(PTHIdKey));
667 // Create the hashtable.
668 llvm::OnDiskChainedHashTableGenerator<PTHIdentifierTableTrait> IIOffMap;
670 // Generate mapping from persistent IDs -> IdentifierInfo*.
671 for (IDMap::iterator I = IM.begin(), E = IM.end(); I != E; ++I) {
672 // Decrement by 1 because we are using a vector for the lookup and
673 // 0 is reserved for NULL.
674 assert(I->second > 0);
675 assert(I->second-1 < idcount);
676 unsigned idx = I->second-1;
678 // Store the mapping from persistent ID to IdentifierInfo*
679 IIDMap[idx].II = I->first;
681 // Store the reverse mapping in a hashtable.
682 IIOffMap.insert(&IIDMap[idx], I->second);
685 // Write out the inverse map first. This causes the PCIDKey entries to
686 // record PTH file offsets for the string data. This is used to write
688 Offset StringTableOffset = IIOffMap.Emit(Out);
690 // Now emit the table mapping from persistent IDs to PTH file offsets.
691 Offset IDOff = Out.tell();
692 Emit32(idcount); // Emit the number of identifiers.
693 for (unsigned i = 0 ; i < idcount; ++i)
694 Emit32(IIDMap[i].FileOffset);
696 // Finally, release the inverse map.
699 return std::make_pair(IDOff, StringTableOffset);