1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Defines facilities for reading and writing on-disk hash tables.
13 //===----------------------------------------------------------------------===//
14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
17 #include "clang/Basic/LLVM.h"
18 #include "llvm/Support/Allocator.h"
19 #include "llvm/Support/DataTypes.h"
20 #include "llvm/Support/Host.h"
21 #include "llvm/Support/MathExtras.h"
22 #include "llvm/Support/raw_ostream.h"
30 typedef uint32_t Offset;
32 inline void Emit8(raw_ostream& Out, uint32_t V) {
33 Out << (unsigned char)(V);
36 inline void Emit16(raw_ostream& Out, uint32_t V) {
37 Out << (unsigned char)(V);
38 Out << (unsigned char)(V >> 8);
39 assert((V >> 16) == 0);
42 inline void Emit24(raw_ostream& Out, uint32_t V) {
43 Out << (unsigned char)(V);
44 Out << (unsigned char)(V >> 8);
45 Out << (unsigned char)(V >> 16);
46 assert((V >> 24) == 0);
49 inline void Emit32(raw_ostream& Out, uint32_t V) {
50 Out << (unsigned char)(V);
51 Out << (unsigned char)(V >> 8);
52 Out << (unsigned char)(V >> 16);
53 Out << (unsigned char)(V >> 24);
56 inline void Emit64(raw_ostream& Out, uint64_t V) {
57 Out << (unsigned char)(V);
58 Out << (unsigned char)(V >> 8);
59 Out << (unsigned char)(V >> 16);
60 Out << (unsigned char)(V >> 24);
61 Out << (unsigned char)(V >> 32);
62 Out << (unsigned char)(V >> 40);
63 Out << (unsigned char)(V >> 48);
64 Out << (unsigned char)(V >> 56);
67 inline void Pad(raw_ostream& Out, unsigned A) {
68 Offset off = (Offset) Out.tell();
69 for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n)
73 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
74 uint16_t V = ((uint16_t)Data[0]) |
75 ((uint16_t)Data[1] << 8);
80 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
81 uint32_t V = ((uint32_t)Data[0]) |
82 ((uint32_t)Data[1] << 8) |
83 ((uint32_t)Data[2] << 16) |
84 ((uint32_t)Data[3] << 24);
89 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
90 uint64_t V = ((uint64_t)Data[0]) |
91 ((uint64_t)Data[1] << 8) |
92 ((uint64_t)Data[2] << 16) |
93 ((uint64_t)Data[3] << 24) |
94 ((uint64_t)Data[4] << 32) |
95 ((uint64_t)Data[5] << 40) |
96 ((uint64_t)Data[6] << 48) |
97 ((uint64_t)Data[7] << 56);
102 inline uint32_t ReadLE32(const unsigned char *&Data) {
103 // Hosts that directly support little-endian 32-bit loads can just
104 // use them. Big-endian hosts need a bswap.
105 uint32_t V = *((const uint32_t*)Data);
106 if (llvm::sys::IsBigEndianHost)
107 V = llvm::ByteSwap_32(V);
112 } // end namespace io
114 template<typename Info>
115 class OnDiskChainedHashTableGenerator {
118 llvm::BumpPtrAllocator BA;
122 typename Info::key_type key;
123 typename Info::data_type data;
127 Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
129 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
144 void insert(Bucket* b, size_t size, Item* E) {
145 unsigned idx = E->hash & (size - 1);
152 void resize(size_t newsize) {
153 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
154 // Populate newBuckets with the old entries.
155 for (unsigned i = 0; i < NumBuckets; ++i)
156 for (Item* E = Buckets[i].head; E ; ) {
159 insert(newBuckets, newsize, E);
164 NumBuckets = newsize;
165 Buckets = newBuckets;
170 void insert(typename Info::key_type_ref key,
171 typename Info::data_type_ref data) {
173 insert(key, data, InfoObj);
176 void insert(typename Info::key_type_ref key,
177 typename Info::data_type_ref data, Info &InfoObj) {
180 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
181 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
185 io::Offset Emit(raw_ostream &out) {
187 return Emit(out, InfoObj);
190 io::Offset Emit(raw_ostream &out, Info &InfoObj) {
191 using namespace clang::io;
193 // Emit the payload of the table.
194 for (unsigned i = 0; i < NumBuckets; ++i) {
195 Bucket& B = Buckets[i];
196 if (!B.head) continue;
198 // Store the offset for the data of this bucket.
200 assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
202 // Write out the number of items in the bucket.
203 Emit16(out, B.length);
204 assert(B.length != 0 && "Bucket has a head but zero length?");
206 // Write out the entries in the bucket.
207 for (Item *I = B.head; I ; I = I->next) {
208 Emit32(out, I->hash);
209 const std::pair<unsigned, unsigned>& Len =
210 InfoObj.EmitKeyDataLength(out, I->key, I->data);
211 InfoObj.EmitKey(out, I->key, Len.first);
212 InfoObj.EmitData(out, I->key, I->data, Len.second);
216 // Emit the hashtable itself.
218 io::Offset TableOff = out.tell();
219 Emit32(out, NumBuckets);
220 Emit32(out, NumEntries);
221 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
226 OnDiskChainedHashTableGenerator() {
229 // Note that we do not need to run the constructors of the individual
230 // Bucket objects since 'calloc' returns bytes that are all 0.
231 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
234 ~OnDiskChainedHashTableGenerator() {
239 template<typename Info>
240 class OnDiskChainedHashTable {
241 const unsigned NumBuckets;
242 const unsigned NumEntries;
243 const unsigned char* const Buckets;
244 const unsigned char* const Base;
248 typedef typename Info::internal_key_type internal_key_type;
249 typedef typename Info::external_key_type external_key_type;
250 typedef typename Info::data_type data_type;
252 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
253 const unsigned char* buckets,
254 const unsigned char* base,
255 const Info &InfoObj = Info())
256 : NumBuckets(numBuckets), NumEntries(numEntries),
257 Buckets(buckets), Base(base), InfoObj(InfoObj) {
258 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
259 "'buckets' must have a 4-byte alignment");
262 unsigned getNumBuckets() const { return NumBuckets; }
263 unsigned getNumEntries() const { return NumEntries; }
264 const unsigned char* getBase() const { return Base; }
265 const unsigned char* getBuckets() const { return Buckets; }
267 bool isEmpty() const { return NumEntries == 0; }
270 internal_key_type key;
271 const unsigned char* const data;
275 iterator() : data(0), len(0) {}
276 iterator(const internal_key_type k, const unsigned char* d, unsigned l,
278 : key(k), data(d), len(l), InfoObj(InfoObj) {}
280 data_type operator*() const { return InfoObj->ReadData(key, data, len); }
281 bool operator==(const iterator& X) const { return X.data == data; }
282 bool operator!=(const iterator& X) const { return X.data != data; }
285 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
290 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
291 unsigned key_hash = InfoObj.ComputeHash(iKey);
293 // Each bucket is just a 32-bit offset into the hash table file.
294 unsigned idx = key_hash & (NumBuckets - 1);
295 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
297 unsigned offset = ReadLE32(Bucket);
298 if (offset == 0) return iterator(); // Empty bucket.
299 const unsigned char* Items = Base + offset;
301 // 'Items' starts with a 16-bit unsigned integer representing the
302 // number of items in this bucket.
303 unsigned len = ReadUnalignedLE16(Items);
305 for (unsigned i = 0; i < len; ++i) {
307 uint32_t item_hash = ReadUnalignedLE32(Items);
309 // Determine the length of the key and the data.
310 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
311 unsigned item_len = L.first + L.second;
313 // Compare the hashes. If they are not the same, skip the entry entirely.
314 if (item_hash != key_hash) {
320 const internal_key_type& X =
321 InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
323 // If the key doesn't match just skip reading the value.
324 if (!InfoPtr->EqualKey(X, iKey)) {
330 return iterator(X, Items + L.first, L.second, InfoPtr);
336 iterator end() const { return iterator(); }
338 /// \brief Iterates over all of the keys in the table.
340 const unsigned char* Ptr;
341 unsigned NumItemsInBucketLeft;
342 unsigned NumEntriesLeft;
345 typedef external_key_type value_type;
347 key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
349 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
352 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
354 friend bool operator==(const key_iterator &X, const key_iterator &Y) {
355 return X.NumEntriesLeft == Y.NumEntriesLeft;
357 friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
358 return X.NumEntriesLeft != Y.NumEntriesLeft;
361 key_iterator& operator++() { // Preincrement
362 if (!NumItemsInBucketLeft) {
363 // 'Items' starts with a 16-bit unsigned integer representing the
364 // number of items in this bucket.
365 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
367 Ptr += 4; // Skip the hash.
368 // Determine the length of the key and the data.
369 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
370 Ptr += L.first + L.second;
371 assert(NumItemsInBucketLeft);
372 --NumItemsInBucketLeft;
373 assert(NumEntriesLeft);
377 key_iterator operator++(int) { // Postincrement
378 key_iterator tmp = *this; ++*this; return tmp;
381 value_type operator*() const {
382 const unsigned char* LocalPtr = Ptr;
383 if (!NumItemsInBucketLeft)
384 LocalPtr += 2; // number of items in bucket
385 LocalPtr += 4; // Skip the hash.
387 // Determine the length of the key and the data.
388 const std::pair<unsigned, unsigned>& L
389 = Info::ReadKeyDataLength(LocalPtr);
392 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
393 return InfoObj->GetExternalKey(Key);
397 key_iterator key_begin() {
398 return key_iterator(Base + 4, getNumEntries(), &InfoObj);
400 key_iterator key_end() { return key_iterator(); }
402 /// \brief Iterates over all the entries in the table, returning the data.
403 class data_iterator {
404 const unsigned char* Ptr;
405 unsigned NumItemsInBucketLeft;
406 unsigned NumEntriesLeft;
409 typedef data_type value_type;
411 data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
413 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
416 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
418 bool operator==(const data_iterator& X) const {
419 return X.NumEntriesLeft == NumEntriesLeft;
421 bool operator!=(const data_iterator& X) const {
422 return X.NumEntriesLeft != NumEntriesLeft;
425 data_iterator& operator++() { // Preincrement
426 if (!NumItemsInBucketLeft) {
427 // 'Items' starts with a 16-bit unsigned integer representing the
428 // number of items in this bucket.
429 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
431 Ptr += 4; // Skip the hash.
432 // Determine the length of the key and the data.
433 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
434 Ptr += L.first + L.second;
435 assert(NumItemsInBucketLeft);
436 --NumItemsInBucketLeft;
437 assert(NumEntriesLeft);
441 data_iterator operator++(int) { // Postincrement
442 data_iterator tmp = *this; ++*this; return tmp;
445 value_type operator*() const {
446 const unsigned char* LocalPtr = Ptr;
447 if (!NumItemsInBucketLeft)
448 LocalPtr += 2; // number of items in bucket
449 LocalPtr += 4; // Skip the hash.
451 // Determine the length of the key and the data.
452 const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
455 const internal_key_type& Key =
456 InfoObj->ReadKey(LocalPtr, L.first);
457 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
461 data_iterator data_begin() {
462 return data_iterator(Base + 4, getNumEntries(), &InfoObj);
464 data_iterator data_end() { return data_iterator(); }
466 Info &getInfoObj() { return InfoObj; }
468 static OnDiskChainedHashTable* Create(const unsigned char* buckets,
469 const unsigned char* const base,
470 const Info &InfoObj = Info()) {
472 assert(buckets > base);
473 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
474 "buckets should be 4-byte aligned.");
476 unsigned numBuckets = ReadLE32(buckets);
477 unsigned numEntries = ReadLE32(buckets);
478 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
483 } // end namespace clang