/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "optimizer/cost.h"
#include "optimizer/plancat.h"
#include "storage/bufmgr.h"

/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
} HashBuildState;

static void hashbuildCallback(Relation index,
                              HeapTuple htup,
                              Datum *values,
                              bool *isnull,
                              bool tupleIsAlive,
                              void *state);

/*
 *  hashbuild() -- build a new hash index.
 */
Datum
hashbuild(PG_FUNCTION_ARGS)
{
    Relation    heap = (Relation) PG_GETARG_POINTER(0);
    Relation    index = (Relation) PG_GETARG_POINTER(1);
    IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
    IndexBuildResult *result;
    BlockNumber relpages;
    double      reltuples;
    uint32      num_buckets;
    HashBuildState buildstate;

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));

    /* Estimate the number of rows currently present in the table */
    estimate_rel_size(heap, NULL, &relpages, &reltuples);

    /* Initialize the hash index metadata page and initial buckets */
    num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM);

    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly.  To prevent that scenario, we can sort the
     * tuples by (expected) bucket number.  However, such a sort is useless
     * overhead when the index does fit in RAM.  We choose to sort if the
     * initial index size exceeds NBuffers.
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page.
     */
    if (num_buckets >= (uint32) NBuffers)
        buildstate.spool = _h_spoolinit(index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;

    /* do the heap scan */
    reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
                                   hashbuildCallback, (void *) &buildstate);

    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
        _h_indexbuild(buildstate.spool);
        _h_spooldestroy(buildstate.spool);
    }

    /* return statistics */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    PG_RETURN_POINTER(result);
}
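
/*
 * For illustration, assuming the default 8kB block size and
 * shared_buffers = 128MB (NBuffers = 16384): a build whose estimated row
 * count calls for 16384 or more initial bucket pages (roughly 128MB of
 * index) takes the _h_spoolinit/_h_indexbuild path above, sorting tuples by
 * expected bucket number before insertion; a smaller build skips the sort
 * and inserts in heap-scan order, since the whole index is expected to fit
 * in shared buffers.
 */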

/*
 *  hashbuildempty() -- build an empty hash index in the initialization fork
 */
Datum
hashbuildempty(PG_FUNCTION_ARGS)
{
    Relation    index = (Relation) PG_GETARG_POINTER(0);

    _hash_metapinit(index, 0, INIT_FORKNUM);

    PG_RETURN_VOID();
}
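
/*
 * For illustration: the initialization fork written here is what makes
 * unlogged hash indexes usable after a crash.  Recovery resets an unlogged
 * relation by copying its init fork over the main fork, leaving a valid,
 * empty hash index in place of whatever unrecoverable contents the main
 * fork held.
 */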

/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
hashbuildCallback(Relation index,
                  HeapTuple htup,
                  Datum *values,
                  bool *isnull,
                  bool tupleIsAlive,
                  void *state)
{
    HashBuildState *buildstate = (HashBuildState *) state;
    IndexTuple  itup;

    /* form an index tuple and point it at the heap tuple */
    itup = _hash_form_tuple(index, values, isnull);
    itup->t_tid = htup->t_self;

    /* Hash indexes don't index nulls, see notes in hashinsert */
    if (IndexTupleHasNulls(itup))
    {
        pfree(itup);
        return;
    }

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(itup, buildstate->spool);
    else
        _hash_doinsert(index, itup);

    buildstate->indtuples += 1;

    pfree(itup);
}

/*
 *  hashinsert() -- insert an index tuple into a hash table.
 *
 *  Hash on the heap tuple's key, form an index tuple with hash code.
 *  Find the appropriate location for the new tuple, and put it there.
 */
Datum
hashinsert(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    Datum      *values = (Datum *) PG_GETARG_POINTER(1);
    bool       *isnull = (bool *) PG_GETARG_POINTER(2);
    ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);

#ifdef NOT_USED
    Relation    heapRel = (Relation) PG_GETARG_POINTER(4);
    IndexUniqueCheck checkUnique = (IndexUniqueCheck) PG_GETARG_INT32(5);
#endif
    IndexTuple  itup;

    /* generate an index tuple */
    itup = _hash_form_tuple(rel, values, isnull);
    itup->t_tid = *ht_ctid;

    /*
     * If the single index key is null, we don't insert it into the index.
     * Hash tables support scans on '='. Relational algebra says that A = B
     * returns null if either A or B is null. This means that no
     * qualification used in an index scan could ever return true on a null
     * attribute. It also means that indices can't be used by ISNULL or
     * NOTNULL scans, but that's an artifact of the strategy map architecture
     * chosen in 1986, not of the way nulls are handled here.
     */
    if (IndexTupleHasNulls(itup))
    {
        pfree(itup);
        PG_RETURN_BOOL(false);
    }

    _hash_doinsert(rel, itup);

    pfree(itup);

    PG_RETURN_BOOL(false);
}
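
/*
 * For illustration: under SQL's three-valued logic a comparison such as
 * "indexed_col = NULL" yields NULL rather than true, so no '=' scan (the
 * only kind a hash index supports) can ever match a NULL key.  Skipping
 * NULL keys here therefore cannot change any query result; it only means
 * IS NULL tests must be answered some other way, e.g. by a sequential scan.
 */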

/*
 *  hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;
    Buffer      buf;
    Page        page;
    OffsetNumber offnum;
    ItemPointer current;
    bool        res;

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;

    /*
     * We hold pin but not lock on current buffer while outside the hash AM.
     * Reacquire the read lock here.
     */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine to
     * get the first item in the scan.
     */
    current = &(so->hashso_curpos);
    if (ItemPointerIsValid(current))
    {
        /*
         * An insertion into the current index page could have happened while
         * we didn't have read lock on it.  Re-find our position by looking
         * for the TID we previously returned.  (Because we hold share lock on
         * the bucket, no deletions or splits could have occurred; therefore
         * we can expect that the TID still exists in the current index page,
         * at an offset >= where we were.)
         */
        OffsetNumber maxoffnum;

        buf = so->hashso_curbuf;
        Assert(BufferIsValid(buf));
        page = BufferGetPage(buf);
        maxoffnum = PageGetMaxOffsetNumber(page);
        for (offnum = ItemPointerGetOffsetNumber(current);
             offnum <= maxoffnum;
             offnum = OffsetNumberNext(offnum))
        {
            IndexTuple  itup;

            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
            if (ItemPointerEquals(&(so->hashso_heappos), &(itup->t_tid)))
                break;
        }
        if (offnum > maxoffnum)
            elog(ERROR, "failed to re-find scan position within index \"%s\"",
                 RelationGetRelationName(rel));
        ItemPointerSetOffsetNumber(current, offnum);

        /*
         * Check to see if we should kill the previously-fetched tuple.
         */
        if (scan->kill_prior_tuple)
        {
            /*
             * Yes, so mark it by setting the LP_DEAD state in the item flags.
             */
            ItemIdMarkDead(PageGetItemId(page, offnum));

            /*
             * Since this can be redone later if needed, it's treated the same
             * as a commit-hint-bit status update for heap tuples: we mark the
             * buffer dirty but don't make a WAL log entry.
             */
            SetBufferCommitInfoNeedsSave(buf);
        }

        /*
         * Now continue the scan.
         */
        res = _hash_next(scan, dir);
    }
    else
        res = _hash_first(scan, dir);

    /*
     * Skip killed tuples if asked to.
     */
    if (scan->ignore_killed_tuples)
    {
        while (res)
        {
            offnum = ItemPointerGetOffsetNumber(current);
            page = BufferGetPage(so->hashso_curbuf);
            if (!ItemIdIsDead(PageGetItemId(page, offnum)))
                break;
            res = _hash_next(scan, dir);
        }
    }

    /* Release read lock on current buffer, but keep it pinned */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

    /* Return current heap TID on success */
    scan->xs_ctup.t_self = so->hashso_heappos;

    PG_RETURN_BOOL(res);
}
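
/*
 * For illustration: only the key's 32-bit hash code is stored in the index,
 * so distinct keys that happen to collide (say, two text values whose hash
 * codes are equal) fetch the same index entries.  The xs_recheck flag set at
 * the top of this routine makes the executor re-evaluate the '=' qual
 * against the actual heap tuple, filtering out such false matches.
 */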

/*
 * hashgetbitmap() -- get all tuples at once
 */
Datum
hashgetbitmap(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    bool        res;
    int64       ntids = 0;

    res = _hash_first(scan, ForwardScanDirection);

    while (res)
    {
        bool        add_tuple;

        /*
         * Skip killed tuples if asked to.
         */
        if (scan->ignore_killed_tuples)
        {
            Page        page;
            OffsetNumber offnum;

            offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
            page = BufferGetPage(so->hashso_curbuf);
            add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
        }
        else
            add_tuple = true;

        /* Save tuple ID, and continue scanning */
        if (add_tuple)
        {
            /* Note we mark the tuple ID as requiring recheck */
            tbm_add_tuples(tbm, &(so->hashso_heappos), 1, true);
            ntids++;
        }

        res = _hash_next(scan, ForwardScanDirection);
    }

    PG_RETURN_INT64(ntids);
}

/*
 *  hashbeginscan() -- start a scan on a hash index
 */
Datum
hashbeginscan(PG_FUNCTION_ARGS)
{
    Relation    rel = (Relation) PG_GETARG_POINTER(0);
    int         nkeys = PG_GETARG_INT32(1);
    int         norderbys = PG_GETARG_INT32(2);
    IndexScanDesc scan;
    HashScanOpaque so;

    /* no order by operators allowed */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    so->hashso_bucket_valid = false;
    so->hashso_bucket_blkno = 0;
    so->hashso_curbuf = InvalidBuffer;
    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    scan->opaque = so;

    /* register scan in case we change pages it's using */
    _hash_regscan(scan);

    PG_RETURN_POINTER(scan);
}

/*
 *  hashrescan() -- rescan an index relation
 */
Datum
hashrescan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    ScanKey     scankey = (ScanKey) PG_GETARG_POINTER(1);

    /* remaining arguments are ignored */
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    /* release any pin we still hold */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_dropbuf(rel, so->hashso_curbuf);
    so->hashso_curbuf = InvalidBuffer;

    /* release lock on bucket, too */
    if (so->hashso_bucket_blkno)
        _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
    so->hashso_bucket_blkno = 0;

    /* set position invalid (this will cause _hash_first call) */
    ItemPointerSetInvalid(&(so->hashso_curpos));
    ItemPointerSetInvalid(&(so->hashso_heappos));

    /* Update scan key, if a new one is given */
    if (scankey && scan->numberOfKeys > 0)
    {
        memmove(scan->keyData,
                scankey,
                scan->numberOfKeys * sizeof(ScanKeyData));
        so->hashso_bucket_valid = false;
    }

    PG_RETURN_VOID();
}
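
/*
 * For illustration: a rescan with fresh key values happens, for example,
 * when this index supplies the inner side of a nested-loop join and the
 * outer row changes.  The new key will generally hash to a different
 * bucket, so hashso_bucket_valid is cleared above to force _hash_first to
 * recompute the target bucket (and take the appropriate bucket lock) on the
 * next fetch.
 */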

/*
 *  hashendscan() -- close down a scan
 */
Datum
hashendscan(PG_FUNCTION_ARGS)
{
    IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
    HashScanOpaque so = (HashScanOpaque) scan->opaque;
    Relation    rel = scan->indexRelation;

    /* don't need scan registered anymore */
    _hash_dropscan(scan);

    /* release any pin we still hold */
    if (BufferIsValid(so->hashso_curbuf))
        _hash_dropbuf(rel, so->hashso_curbuf);
    so->hashso_curbuf = InvalidBuffer;

    /* release lock on bucket, too */
    if (so->hashso_bucket_blkno)
        _hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
    so->hashso_bucket_blkno = 0;

    pfree(so);
    scan->opaque = NULL;

    PG_RETURN_VOID();
}

/*
 *  hashmarkpos() -- save current scan position
 */
Datum
hashmarkpos(PG_FUNCTION_ARGS)
{
    elog(ERROR, "hash does not support mark/restore");
    PG_RETURN_VOID();
}

/*
 *  hashrestrpos() -- restore scan to last saved position
 */
Datum
hashrestrpos(PG_FUNCTION_ARGS)
{
    elog(ERROR, "hash does not support mark/restore");
    PG_RETURN_VOID();
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
    void       *callback_state = (void *) PG_GETARG_POINTER(3);
    Relation    rel = info->index;
    double      tuples_removed;
    double      num_index_tuples;
    double      orig_ntuples;
    Bucket      orig_maxbucket;
    Bucket      cur_maxbucket;
    Bucket      cur_bucket;
    Buffer      metabuf;
    HashMetaPage metap;
    HashMetaPageData local_metapage;

    tuples_removed = 0;
    num_index_tuples = 0;

    /*
     * Read the metapage to fetch original bucket and tuple counts. Also, we
     * keep a copy of the last-seen metapage so that we can use its
     * hashm_spares[] values to compute bucket page addresses. This is a bit
     * hokey but perfectly safe, since the interesting entries in the spares
     * array cannot change under us; and it beats rereading the metapage for
     * each bucket.
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));
    orig_maxbucket = metap->hashm_maxbucket;
    orig_ntuples = metap->hashm_ntuples;
    memcpy(&local_metapage, metap, sizeof(local_metapage));
    _hash_relbuf(rel, metabuf);

    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

loop_top:
    while (cur_bucket <= cur_maxbucket)
    {
        BlockNumber bucket_blkno;
        BlockNumber blkno;
        bool        bucket_dirty = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

        /* Exclusive-lock the bucket so we can shrink it */
        _hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Shouldn't have any active scans locally, either */
        if (_hash_has_active_scan(rel, cur_bucket))
            elog(ERROR, "hash index has active scan during VACUUM");

        /* Scan each page in bucket */
        blkno = bucket_blkno;
        while (BlockNumberIsValid(blkno))
        {
            Buffer      buf;
            Page        page;
            HashPageOpaque opaque;
            OffsetNumber offno;
            OffsetNumber maxoffno;
            OffsetNumber deletable[MaxOffsetNumber];
            int         ndeletable = 0;

            vacuum_delay_point();

            buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
                                             LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
                                             info->strategy);
            page = BufferGetPage(buf);
            opaque = (HashPageOpaque) PageGetSpecialPointer(page);
            Assert(opaque->hasho_bucket == cur_bucket);

            /* Scan each tuple in page */
            maxoffno = PageGetMaxOffsetNumber(page);
            for (offno = FirstOffsetNumber;
                 offno <= maxoffno;
                 offno = OffsetNumberNext(offno))
            {
                IndexTuple  itup;
                ItemPointer htup;

                itup = (IndexTuple) PageGetItem(page,
                                                PageGetItemId(page, offno));
                htup = &(itup->t_tid);
                if (callback(htup, callback_state))
                {
                    /* mark the item for deletion */
                    deletable[ndeletable++] = offno;
                    tuples_removed += 1;
                }
                else
                    num_index_tuples += 1;
            }

            /*
             * Apply deletions and write page if needed, advance to next page.
             */
            blkno = opaque->hasho_nextblkno;

            if (ndeletable > 0)
            {
                PageIndexMultiDelete(page, deletable, ndeletable);
                _hash_wrtbuf(rel, buf);
                bucket_dirty = true;
            }
            else
                _hash_relbuf(rel, buf);
        }

        /* If we deleted anything, try to compact free space */
        if (bucket_dirty)
            _hash_squeezebucket(rel, cur_bucket, bucket_blkno,
                                info->strategy);

        /* Release bucket lock */
        _hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

        /* Advance to next bucket */
        cur_bucket++;
    }

    /* Write-lock metapage and check for split since we started */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));

    if (cur_maxbucket != metap->hashm_maxbucket)
    {
        /* There's been a split, so process the additional bucket(s) */
        cur_maxbucket = metap->hashm_maxbucket;
        memcpy(&local_metapage, metap, sizeof(local_metapage));
        _hash_relbuf(rel, metabuf);
        goto loop_top;
    }

    /* Okay, we're really done.  Update tuple count in metapage. */

    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
    {
        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */
        metap->hashm_ntuples = num_index_tuples;
    }
    else
    {
        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }

    _hash_wrtbuf(rel, metabuf);

    /* return statistics */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;
    /* hashvacuumcleanup will fill in num_pages */

    PG_RETURN_POINTER(stats);
}
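
/*
 * For illustration: suppose hashm_ntuples was 10000 when the metapage was
 * re-read, a concurrent split made our own count untrustworthy, and this
 * pass removed 600 tuples.  The dead-reckoned value written back is
 * 10000 - 600 = 9400; if tuples_removed ever exceeded the stored count,
 * the result is clamped to zero rather than allowed to go negative.
 */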

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashvacuumcleanup(PG_FUNCTION_ARGS)
{
    IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
    IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
    Relation    rel = info->index;
    BlockNumber num_pages;

    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        PG_RETURN_POINTER(NULL);

    /* update statistics */
    num_pages = RelationGetNumberOfBlocks(rel);
    stats->num_pages = num_pages;

    PG_RETURN_POINTER(stats);
}
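
/*
 * For illustration: freed overflow pages are only recycled through the
 * index's own bitmap pages and reused by later overflow allocations; the
 * relation is never truncated here.  num_pages is therefore simply the
 * current physical length of the index, as reported by
 * RelationGetNumberOfBlocks.
 */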

void
hash_redo(XLogRecPtr lsn, XLogRecord *record)
{
    elog(PANIC, "hash_redo: unimplemented");
}
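
/*
 * For illustration: hash index operations are not WAL-logged in this code,
 * so no hash WAL records should ever appear in the log and this routine
 * should be unreachable; after a crash (or on a streaming replica) a hash
 * index may therefore be corrupt and must be rebuilt with REINDEX.  The
 * elog(PANIC) makes any unexpected hash record impossible to miss.
 */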

void
hash_desc(StringInfo buf, uint8 xl_info, char *rec)
{
}