1 //===- MultiOnDiskHashTable.h - Merged set of hash tables -------*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file provides a hash table data structure suitable for incremental and
11 // distributed storage across a set of files.
13 // Multiple hash tables from different files are implicitly merged to improve
14 // performance, and on reload the merged table will override those from other
17 //===----------------------------------------------------------------------===//
19 #ifndef LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
20 #define LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H
22 #include "llvm/ADT/DenseMap.h"
23 #include "llvm/ADT/DenseSet.h"
24 #include "llvm/ADT/PointerUnion.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallVector.h"
27 #include "llvm/ADT/TinyPtrVector.h"
28 #include "llvm/ADT/iterator_range.h"
29 #include "llvm/Support/Endian.h"
30 #include "llvm/Support/EndianStream.h"
31 #include "llvm/Support/OnDiskHashTable.h"
32 #include "llvm/Support/raw_ostream.h"
38 namespace serialization {
40 /// A collection of on-disk hash tables, merged when relevant for performance.
41 template<typename Info> class MultiOnDiskHashTable {
43 /// A handle to a file, used when overriding tables.
44 using file_type = typename Info::file_type;
46 /// A pointer to an on-disk representation of the hash table.
47 using storage_type = const unsigned char *;
49 using external_key_type = typename Info::external_key_type;
50 using internal_key_type = typename Info::internal_key_type;
51 using data_type = typename Info::data_type;
52 using data_type_builder = typename Info::data_type_builder;
53 using hash_value_type = unsigned;
56 /// The generator is permitted to read our merged table.
57 template<typename ReaderInfo, typename WriterInfo>
58 friend class MultiOnDiskHashTableGenerator;
60 /// A hash table stored on disk.
62 using HashTable = llvm::OnDiskIterableChainedHashTable<Info>;
67 OnDiskTable(file_type File, unsigned NumBuckets, unsigned NumEntries,
68 storage_type Buckets, storage_type Payload, storage_type Base,
71 Table(NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj) {}
75 std::vector<file_type> Files;
76 llvm::DenseMap<internal_key_type, data_type> Data;
79 using Table = llvm::PointerUnion<OnDiskTable *, MergedTable *>;
80 using TableVector = llvm::TinyPtrVector<void *>;
82 /// The current set of on-disk and merged tables.
83 /// We manually store the opaque value of the Table because TinyPtrVector
84 /// can't cope with holding a PointerUnion directly.
85 /// There can be at most one MergedTable in this vector, and if present,
86 /// it is the first table.
89 /// Files corresponding to overridden tables that we've not yet
91 llvm::TinyPtrVector<file_type> PendingOverrides;
93 struct AsOnDiskTable {
94 using result_type = OnDiskTable *;
96 result_type operator()(void *P) const {
97 return Table::getFromOpaqueValue(P).template get<OnDiskTable *>();
101 using table_iterator =
102 llvm::mapped_iterator<TableVector::iterator, AsOnDiskTable>;
103 using table_range = llvm::iterator_range<table_iterator>;
105 /// The current set of on-disk tables.
106 table_range tables() {
107 auto Begin = Tables.begin(), End = Tables.end();
108 if (getMergedTable())
110 return llvm::make_range(llvm::map_iterator(Begin, AsOnDiskTable()),
111 llvm::map_iterator(End, AsOnDiskTable()));
114 MergedTable *getMergedTable() const {
115 // If we already have a merged table, it's the first one.
116 return Tables.empty() ? nullptr : Table::getFromOpaqueValue(*Tables.begin())
117 .template dyn_cast<MergedTable*>();
120 /// Delete all our current on-disk tables.
122 for (auto *T : tables())
124 if (auto *M = getMergedTable())
129 void removeOverriddenTables() {
130 llvm::DenseSet<file_type> Files;
131 Files.insert(PendingOverrides.begin(), PendingOverrides.end());
132 // Explicitly capture Files to work around an MSVC 2015 rejects-valid bug.
133 auto ShouldRemove = [&Files](void *T) -> bool {
134 auto *ODT = Table::getFromOpaqueValue(T).template get<OnDiskTable *>();
135 bool Remove = Files.count(ODT->File);
140 Tables.erase(std::remove_if(tables().begin().getCurrent(), Tables.end(),
143 PendingOverrides.clear();
147 MergedTable *Merged = getMergedTable();
149 Merged = new MergedTable;
151 // Read in all the tables and merge them together.
152 // FIXME: Be smarter about which tables we merge.
153 for (auto *ODT : tables()) {
154 auto &HT = ODT->Table;
155 Info &InfoObj = HT.getInfoObj();
157 for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
158 auto *LocalPtr = I.getItem();
160 // FIXME: Don't rely on the OnDiskHashTable format here.
161 auto L = InfoObj.ReadKeyDataLength(LocalPtr);
162 const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
163 data_type_builder ValueBuilder(Merged->Data[Key]);
164 InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second,
168 Merged->Files.push_back(ODT->File);
173 Tables.push_back(Table(Merged).getOpaqueValue());
177 MultiOnDiskHashTable() = default;
179 MultiOnDiskHashTable(MultiOnDiskHashTable &&O)
180 : Tables(std::move(O.Tables)),
181 PendingOverrides(std::move(O.PendingOverrides)) {
185 MultiOnDiskHashTable &operator=(MultiOnDiskHashTable &&O) {
189 Tables = std::move(O.Tables);
191 PendingOverrides = std::move(O.PendingOverrides);
195 ~MultiOnDiskHashTable() { clear(); }
197 /// Add the table \p Data loaded from file \p File.
198 void add(file_type File, storage_type Data, Info InfoObj = Info()) {
199 using namespace llvm::support;
201 storage_type Ptr = Data;
203 uint32_t BucketOffset = endian::readNext<uint32_t, little, unaligned>(Ptr);
205 // Read the list of overridden files.
206 uint32_t NumFiles = endian::readNext<uint32_t, little, unaligned>(Ptr);
207 // FIXME: Add a reserve() to TinyPtrVector so that we don't need to make
208 // an additional copy.
209 llvm::SmallVector<file_type, 16> OverriddenFiles;
210 OverriddenFiles.reserve(NumFiles);
211 for (/**/; NumFiles != 0; --NumFiles)
212 OverriddenFiles.push_back(InfoObj.ReadFileRef(Ptr));
213 PendingOverrides.insert(PendingOverrides.end(), OverriddenFiles.begin(),
214 OverriddenFiles.end());
216 // Read the OnDiskChainedHashTable header.
217 storage_type Buckets = Data + BucketOffset;
218 auto NumBucketsAndEntries =
219 OnDiskTable::HashTable::readNumBucketsAndEntries(Buckets);
221 // Register the table.
222 Table NewTable = new OnDiskTable(File, NumBucketsAndEntries.first,
223 NumBucketsAndEntries.second,
224 Buckets, Ptr, Data, std::move(InfoObj));
225 Tables.push_back(NewTable.getOpaqueValue());
228 /// Find and read the lookup results for \p EKey.
229 data_type find(const external_key_type &EKey) {
232 if (!PendingOverrides.empty())
233 removeOverriddenTables();
235 if (Tables.size() > static_cast<unsigned>(Info::MaxTables))
238 internal_key_type Key = Info::GetInternalKey(EKey);
239 auto KeyHash = Info::ComputeHash(Key);
241 if (MergedTable *M = getMergedTable()) {
242 auto It = M->Data.find(Key);
243 if (It != M->Data.end())
247 data_type_builder ResultBuilder(Result);
249 for (auto *ODT : tables()) {
250 auto &HT = ODT->Table;
251 auto It = HT.find_hashed(Key, KeyHash);
253 HT.getInfoObj().ReadDataInto(Key, It.getDataPtr(), It.getDataLen(),
260 /// Read all the lookup results into a single value. This only makes
261 /// sense if merging values across keys is meaningful.
262 data_type findAll() {
264 data_type_builder ResultBuilder(Result);
266 if (!PendingOverrides.empty())
267 removeOverriddenTables();
269 if (MergedTable *M = getMergedTable()) {
270 for (auto &KV : M->Data)
271 Info::MergeDataInto(KV.second, ResultBuilder);
274 for (auto *ODT : tables()) {
275 auto &HT = ODT->Table;
276 Info &InfoObj = HT.getInfoObj();
277 for (auto I = HT.data_begin(), E = HT.data_end(); I != E; ++I) {
278 auto *LocalPtr = I.getItem();
280 // FIXME: Don't rely on the OnDiskHashTable format here.
281 auto L = InfoObj.ReadKeyDataLength(LocalPtr);
282 const internal_key_type &Key = InfoObj.ReadKey(LocalPtr, L.first);
283 InfoObj.ReadDataInto(Key, LocalPtr + L.first, L.second, ResultBuilder);
291 /// Writer for the on-disk hash table.
292 template<typename ReaderInfo, typename WriterInfo>
293 class MultiOnDiskHashTableGenerator {
294 using BaseTable = MultiOnDiskHashTable<ReaderInfo>;
295 using Generator = llvm::OnDiskChainedHashTableGenerator<WriterInfo>;
300 MultiOnDiskHashTableGenerator() : Gen() {}
302 void insert(typename WriterInfo::key_type_ref Key,
303 typename WriterInfo::data_type_ref Data, WriterInfo &Info) {
304 Gen.insert(Key, Data, Info);
307 void emit(llvm::SmallVectorImpl<char> &Out, WriterInfo &Info,
308 const BaseTable *Base) {
309 using namespace llvm::support;
311 llvm::raw_svector_ostream OutStream(Out);
313 // Write our header information.
315 endian::Writer Writer(OutStream, little);
317 // Reserve four bytes for the bucket offset.
318 Writer.write<uint32_t>(0);
320 if (auto *Merged = Base ? Base->getMergedTable() : nullptr) {
321 // Write list of overridden files.
322 Writer.write<uint32_t>(Merged->Files.size());
323 for (const auto &F : Merged->Files)
324 Info.EmitFileRef(OutStream, F);
326 // Add all merged entries from Base to the generator.
327 for (auto &KV : Merged->Data) {
328 if (!Gen.contains(KV.first, Info))
329 Gen.insert(KV.first, Info.ImportData(KV.second), Info);
332 Writer.write<uint32_t>(0);
336 // Write the table itself.
337 uint32_t BucketOffset = Gen.Emit(OutStream, Info);
339 // Replace the first four bytes with the bucket offset.
340 endian::write32le(Out.data(), BucketOffset);
344 } // namespace serialization
347 #endif // LLVM_CLANG_LIB_SERIALIZATION_MULTIONDISKHASHTABLE_H