1
//===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
3
// The LLVM Compiler Infrastructure
5
// This file is distributed under the University of Illinois Open Source
6
// License. See LICENSE.TXT for details.
8
//===----------------------------------------------------------------------===//
11
/// \brief Defines facilities for reading and writing on-disk hash tables.
13
//===----------------------------------------------------------------------===//
14
#ifndef LLVM_SUPPORT_ONDISKHASHTABLE_H
15
#define LLVM_SUPPORT_ONDISKHASHTABLE_H
17
#include "llvm/Support/AlignOf.h"
18
#include "llvm/Support/Allocator.h"
19
#include "llvm/Support/DataTypes.h"
20
#include "llvm/Support/EndianStream.h"
21
#include "llvm/Support/Host.h"
22
#include "llvm/Support/MathExtras.h"
23
#include "llvm/Support/raw_ostream.h"
29
/// \brief Generates an on disk hash table.
31
/// This needs an \c Info that handles storing values into the hash table's
32
/// payload and computes the hash for a given key. This should provide the
33
/// following interface:
36
/// class ExampleInfo {
38
/// typedef ExampleKey key_type; // Must be copy constructible
39
/// typedef ExampleKey &key_type_ref;
40
/// typedef ExampleData data_type; // Must be copy constructible
41
/// typedef ExampleData &data_type_ref;
42
/// typedef uint32_t hash_value_type; // The type the hash function returns.
43
/// typedef uint32_t offset_type; // The type for offsets into the table.
45
/// /// Calculate the hash for Key
46
/// static hash_value_type ComputeHash(key_type_ref Key);
47
/// /// Return the lengths, in bytes, of the given Key/Data pair.
48
/// static std::pair<offset_type, offset_type>
49
/// EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
50
/// /// Write Key to Out. KeyLen is the length from EmitKeyDataLength.
51
/// static void EmitKey(raw_ostream &Out, key_type_ref Key,
52
/// offset_type KeyLen);
53
/// /// Write Data to Out. DataLen is the length from EmitKeyDataLength.
54
/// static void EmitData(raw_ostream &Out, key_type_ref Key,
55
/// data_type_ref Data, offset_type DataLen);
58
template <typename Info> class OnDiskChainedHashTableGenerator {
59
/// \brief A single item in the hash table.
62
typename Info::key_type Key;
63
typename Info::data_type Data;
65
const typename Info::hash_value_type Hash;
67
Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
69
: Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {}
72
typedef typename Info::offset_type offset_type;
73
offset_type NumBuckets;
74
offset_type NumEntries;
75
llvm::SpecificBumpPtrAllocator<Item> BA;
77
/// \brief A linked list of values in a particular hash bucket.
87
/// \brief Insert an item into the appropriate hash bucket.
88
void insert(Bucket *Buckets, size_t Size, Item *E) {
89
Bucket &B = Buckets[E->Hash & (Size - 1)];
95
/// \brief Resize the hash table, moving the old entries into the new buckets.
96
void resize(size_t NewSize) {
97
Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
98
// Populate NewBuckets with the old entries.
99
for (size_t I = 0; I < NumBuckets; ++I)
100
for (Item *E = Buckets[I].Head; E;) {
103
insert(NewBuckets, NewSize, E);
108
NumBuckets = NewSize;
109
Buckets = NewBuckets;
113
/// \brief Insert an entry into the table.
114
void insert(typename Info::key_type_ref Key,
115
typename Info::data_type_ref Data) {
117
insert(Key, Data, InfoObj);
120
/// \brief Insert an entry into the table.
122
/// Uses the provided Info instead of a stack allocated one.
123
void insert(typename Info::key_type_ref Key,
124
typename Info::data_type_ref Data, Info &InfoObj) {
127
if (4 * NumEntries >= 3 * NumBuckets)
128
resize(NumBuckets * 2);
129
insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
132
/// \brief Emit the table to Out, which must not be at offset 0.
133
offset_type Emit(raw_ostream &Out) {
135
return Emit(Out, InfoObj);
138
/// \brief Emit the table to Out, which must not be at offset 0.
140
/// Uses the provided Info instead of a stack allocated one.
141
offset_type Emit(raw_ostream &Out, Info &InfoObj) {
142
using namespace llvm::support;
143
endian::Writer<little> LE(Out);
145
// Emit the payload of the table.
146
for (offset_type I = 0; I < NumBuckets; ++I) {
147
Bucket &B = Buckets[I];
151
// Store the offset for the data of this bucket.
153
assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
155
// Write out the number of items in the bucket.
156
LE.write<uint16_t>(B.Length);
157
assert(B.Length != 0 && "Bucket has a head but zero length?");
159
// Write out the entries in the bucket.
160
for (Item *I = B.Head; I; I = I->Next) {
161
LE.write<typename Info::hash_value_type>(I->Hash);
162
const std::pair<offset_type, offset_type> &Len =
163
InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
164
InfoObj.EmitKey(Out, I->Key, Len.first);
165
InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
169
// Pad with zeros so that we can start the hashtable at an aligned address.
170
offset_type TableOff = Out.tell();
171
uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<offset_type>());
174
LE.write<uint8_t>(0);
176
// Emit the hashtable itself.
177
LE.write<offset_type>(NumBuckets);
178
LE.write<offset_type>(NumEntries);
179
for (offset_type I = 0; I < NumBuckets; ++I)
180
LE.write<offset_type>(Buckets[I].Off);
185
OnDiskChainedHashTableGenerator() {
188
// Note that we do not need to run the constructors of the individual
189
// Bucket objects since 'calloc' returns bytes that are all 0.
190
Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
193
~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
196
/// \brief Provides lookup on an on disk hash table.
198
/// This needs an \c Info that handles reading values from the hash table's
199
/// payload and computes the hash for a given key. This should provide the
200
/// following interface:
203
/// class ExampleLookupInfo {
205
/// typedef ExampleData data_type;
206
/// typedef ExampleInternalKey internal_key_type; // The stored key type.
207
/// typedef ExampleKey external_key_type; // The type to pass to find().
208
/// typedef uint32_t hash_value_type; // The type the hash function returns.
209
/// typedef uint32_t offset_type; // The type for offsets into the table.
211
/// /// Compare two keys for equality.
212
/// static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
213
/// /// Calculate the hash for the given key.
214
/// static hash_value_type ComputeHash(internal_key_type &IKey);
215
/// /// Translate from the semantic type of a key in the hash table to the
216
/// /// type that is actually stored and used for hashing and comparisons.
217
/// /// The internal and external types are often the same, in which case this
218
/// /// can simply return the passed in value.
219
/// static const internal_key_type &GetInternalKey(external_key_type &EKey);
220
/// /// Read the key and data length from Buffer, leaving it pointing at the
221
/// /// following byte.
222
/// static std::pair<offset_type, offset_type>
223
/// ReadKeyDataLength(const unsigned char *&Buffer);
224
/// /// Read the key from Buffer, given the KeyLen as reported from
225
/// /// ReadKeyDataLength.
226
/// const internal_key_type &ReadKey(const unsigned char *Buffer,
227
/// offset_type KeyLen);
228
/// /// Read the data for Key from Buffer, given the DataLen as reported from
229
/// /// ReadKeyDataLength.
230
/// data_type ReadData(StringRef Key, const unsigned char *Buffer,
231
/// offset_type DataLen);
234
template <typename Info> class OnDiskChainedHashTable {
235
const typename Info::offset_type NumBuckets;
236
const typename Info::offset_type NumEntries;
237
const unsigned char *const Buckets;
238
const unsigned char *const Base;
242
typedef typename Info::internal_key_type internal_key_type;
243
typedef typename Info::external_key_type external_key_type;
244
typedef typename Info::data_type data_type;
245
typedef typename Info::hash_value_type hash_value_type;
246
typedef typename Info::offset_type offset_type;
248
OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
249
const unsigned char *Buckets,
250
const unsigned char *Base,
251
const Info &InfoObj = Info())
252
: NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
253
Base(Base), InfoObj(InfoObj) {
254
assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
255
"'buckets' must have a 4-byte alignment");
258
offset_type getNumBuckets() const { return NumBuckets; }
259
offset_type getNumEntries() const { return NumEntries; }
260
const unsigned char *getBase() const { return Base; }
261
const unsigned char *getBuckets() const { return Buckets; }
263
bool isEmpty() const { return NumEntries == 0; }
266
internal_key_type Key;
267
const unsigned char *const Data;
268
const offset_type Len;
272
iterator() : Data(nullptr), Len(0) {}
273
iterator(const internal_key_type K, const unsigned char *D, offset_type L,
275
: Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
277
data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
278
bool operator==(const iterator &X) const { return X.Data == Data; }
279
bool operator!=(const iterator &X) const { return X.Data != Data; }
282
/// \brief Look up the stored data for a particular key.
283
iterator find(const external_key_type &EKey, Info *InfoPtr = nullptr) {
284
const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
285
hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
286
return find_hashed(IKey, KeyHash, InfoPtr);
289
/// \brief Look up the stored data for a particular key with a known hash.
290
iterator find_hashed(const internal_key_type &IKey, hash_value_type KeyHash,
291
Info *InfoPtr = nullptr) {
292
using namespace llvm::support;
297
// Each bucket is just an offset into the hash table file.
298
offset_type Idx = KeyHash & (NumBuckets - 1);
299
const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx;
301
offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket);
303
return iterator(); // Empty bucket.
304
const unsigned char *Items = Base + Offset;
306
// 'Items' starts with a 16-bit unsigned integer representing the
307
// number of items in this bucket.
308
unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
310
for (unsigned i = 0; i < Len; ++i) {
312
hash_value_type ItemHash =
313
endian::readNext<hash_value_type, little, unaligned>(Items);
315
// Determine the length of the key and the data.
316
const std::pair<offset_type, offset_type> &L =
317
Info::ReadKeyDataLength(Items);
318
offset_type ItemLen = L.first + L.second;
320
// Compare the hashes. If they are not the same, skip the entry entirely.
321
if (ItemHash != KeyHash) {
327
const internal_key_type &X =
328
InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
330
// If the key doesn't match just skip reading the value.
331
if (!InfoPtr->EqualKey(X, IKey)) {
337
return iterator(X, Items + L.first, L.second, InfoPtr);
343
iterator end() const { return iterator(); }
345
Info &getInfoObj() { return InfoObj; }
347
/// \brief Create the hash table.
349
/// \param Buckets is the beginning of the hash table itself, which follows
350
/// the payload of entire structure. This is the value returned by
351
/// OnDiskHashTableGenerator::Emit.
353
/// \param Base is the point from which all offsets into the structure are
354
/// based. This is offset 0 in the stream that was used when Emitting the
356
static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
357
const unsigned char *const Base,
358
const Info &InfoObj = Info()) {
359
using namespace llvm::support;
360
assert(Buckets > Base);
361
assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
362
"buckets should be 4-byte aligned.");
364
offset_type NumBuckets =
365
endian::readNext<offset_type, little, aligned>(Buckets);
366
offset_type NumEntries =
367
endian::readNext<offset_type, little, aligned>(Buckets);
368
return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
373
/// \brief Provides lookup and iteration over an on disk hash table.
375
/// \copydetails llvm::OnDiskChainedHashTable
376
template <typename Info>
377
class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
378
const unsigned char *Payload;
381
typedef OnDiskChainedHashTable<Info> base_type;
382
typedef typename base_type::internal_key_type internal_key_type;
383
typedef typename base_type::external_key_type external_key_type;
384
typedef typename base_type::data_type data_type;
385
typedef typename base_type::hash_value_type hash_value_type;
386
typedef typename base_type::offset_type offset_type;
388
OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
389
const unsigned char *Buckets,
390
const unsigned char *Payload,
391
const unsigned char *Base,
392
const Info &InfoObj = Info())
393
: base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
396
/// \brief Iterates over all of the keys in the table.
398
const unsigned char *Ptr;
399
offset_type NumItemsInBucketLeft;
400
offset_type NumEntriesLeft;
404
typedef external_key_type value_type;
406
key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
408
: Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
411
: Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
414
friend bool operator==(const key_iterator &X, const key_iterator &Y) {
415
return X.NumEntriesLeft == Y.NumEntriesLeft;
417
friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
418
return X.NumEntriesLeft != Y.NumEntriesLeft;
421
key_iterator &operator++() { // Preincrement
422
using namespace llvm::support;
423
if (!NumItemsInBucketLeft) {
424
// 'Items' starts with a 16-bit unsigned integer representing the
425
// number of items in this bucket.
426
NumItemsInBucketLeft =
427
endian::readNext<uint16_t, little, unaligned>(Ptr);
429
Ptr += sizeof(hash_value_type); // Skip the hash.
430
// Determine the length of the key and the data.
431
const std::pair<offset_type, offset_type> &L =
432
Info::ReadKeyDataLength(Ptr);
433
Ptr += L.first + L.second;
434
assert(NumItemsInBucketLeft);
435
--NumItemsInBucketLeft;
436
assert(NumEntriesLeft);
440
key_iterator operator++(int) { // Postincrement
441
key_iterator tmp = *this; ++*this; return tmp;
444
value_type operator*() const {
445
const unsigned char *LocalPtr = Ptr;
446
if (!NumItemsInBucketLeft)
447
LocalPtr += 2; // number of items in bucket
448
LocalPtr += sizeof(hash_value_type); // Skip the hash.
450
// Determine the length of the key and the data.
451
const std::pair<offset_type, offset_type> &L =
452
Info::ReadKeyDataLength(LocalPtr);
455
const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
456
return InfoObj->GetExternalKey(Key);
460
key_iterator key_begin() {
461
return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
463
key_iterator key_end() { return key_iterator(); }
465
iterator_range<key_iterator> keys() {
466
return make_range(key_begin(), key_end());
469
/// \brief Iterates over all the entries in the table, returning the data.
470
class data_iterator {
471
const unsigned char *Ptr;
472
offset_type NumItemsInBucketLeft;
473
offset_type NumEntriesLeft;
477
typedef data_type value_type;
479
data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
481
: Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
484
: Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
487
bool operator==(const data_iterator &X) const {
488
return X.NumEntriesLeft == NumEntriesLeft;
490
bool operator!=(const data_iterator &X) const {
491
return X.NumEntriesLeft != NumEntriesLeft;
494
data_iterator &operator++() { // Preincrement
495
using namespace llvm::support;
496
if (!NumItemsInBucketLeft) {
497
// 'Items' starts with a 16-bit unsigned integer representing the
498
// number of items in this bucket.
499
NumItemsInBucketLeft =
500
endian::readNext<uint16_t, little, unaligned>(Ptr);
502
Ptr += sizeof(hash_value_type); // Skip the hash.
503
// Determine the length of the key and the data.
504
const std::pair<offset_type, offset_type> &L =
505
Info::ReadKeyDataLength(Ptr);
506
Ptr += L.first + L.second;
507
assert(NumItemsInBucketLeft);
508
--NumItemsInBucketLeft;
509
assert(NumEntriesLeft);
513
data_iterator operator++(int) { // Postincrement
514
data_iterator tmp = *this; ++*this; return tmp;
517
value_type operator*() const {
518
const unsigned char *LocalPtr = Ptr;
519
if (!NumItemsInBucketLeft)
520
LocalPtr += 2; // number of items in bucket
521
LocalPtr += sizeof(hash_value_type); // Skip the hash.
523
// Determine the length of the key and the data.
524
const std::pair<offset_type, offset_type> &L =
525
Info::ReadKeyDataLength(LocalPtr);
528
const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
529
return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
533
data_iterator data_begin() {
534
return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
536
data_iterator data_end() { return data_iterator(); }
538
iterator_range<data_iterator> data() {
539
return make_range(data_begin(), data_end());
542
/// \brief Create the hash table.
544
/// \param Buckets is the beginning of the hash table itself, which follows
545
/// the payload of entire structure. This is the value returned by
546
/// OnDiskHashTableGenerator::Emit.
548
/// \param Payload is the beginning of the data contained in the table. This
549
/// is Base plus any padding or header data that was stored, ie, the offset
550
/// that the stream was at when calling Emit.
552
/// \param Base is the point from which all offsets into the structure are
553
/// based. This is offset 0 in the stream that was used when Emitting the
555
static OnDiskIterableChainedHashTable *
556
Create(const unsigned char *Buckets, const unsigned char *const Payload,
557
const unsigned char *const Base, const Info &InfoObj = Info()) {
558
using namespace llvm::support;
559
assert(Buckets > Base);
560
assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
561
"buckets should be 4-byte aligned.");
563
offset_type NumBuckets =
564
endian::readNext<offset_type, little, aligned>(Buckets);
565
offset_type NumEntries =
566
endian::readNext<offset_type, little, aligned>(Buckets);
567
return new OnDiskIterableChainedHashTable<Info>(
568
NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
572
} // end namespace llvm