/*-------------------------------------------------------------------------
 *
 * hash.c
 *	  Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * NOTES
 *	  This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "access/hash.h"
#include "access/relscan.h"
#include "catalog/index.h"
#include "commands/vacuum.h"
#include "miscadmin.h"
#include "optimizer/cost.h"
#include "optimizer/plancat.h"
#include "storage/bufmgr.h"

/* Working state for hashbuild and its callback */
typedef struct
{
	HSpool	   *spool;			/* NULL if not using spooling */
	double		indtuples;		/* # tuples accepted into index */
} HashBuildState;

static void hashbuildCallback(Relation index,
				  HeapTuple htup,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state);

/*
 *	hashbuild() -- build a new hash index.
 */
Datum
hashbuild(PG_FUNCTION_ARGS)
{
	Relation	heap = (Relation) PG_GETARG_POINTER(0);
	Relation	index = (Relation) PG_GETARG_POINTER(1);
	IndexInfo  *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2);
	IndexBuildResult *result;
	BlockNumber relpages;
	double		reltuples;
	uint32		num_buckets;
	HashBuildState buildstate;

	/*
	 * We expect to be called exactly once for any index relation. If that's
	 * not the case, big trouble's what we have.
	 */
	if (RelationGetNumberOfBlocks(index) != 0)
		elog(ERROR, "index \"%s\" already contains data",
			 RelationGetRelationName(index));

	/* Estimate the number of rows currently present in the table */
	estimate_rel_size(heap, NULL, &relpages, &reltuples);

	/* Initialize the hash index metadata page and initial buckets */
	num_buckets = _hash_metapinit(index, reltuples);

	/*
	 * If we just insert the tuples into the index in scan order, then
	 * (assuming their hash codes are pretty random) there will be no locality
	 * of access to the index, and if the index is bigger than available RAM
	 * then we'll thrash horribly.  To prevent that scenario, we can sort the
	 * tuples by (expected) bucket number.  However, such a sort is useless
	 * overhead when the index does fit in RAM.  We choose to sort if the
	 * initial index size exceeds NBuffers.
	 *
	 * NOTE: this test will need adjustment if a bucket is ever different
	 * from one page.
	 */
	if (num_buckets >= (uint32) NBuffers)
		buildstate.spool = _h_spoolinit(index, num_buckets);
	else
		buildstate.spool = NULL;

	/* prepare to build the index */
	buildstate.indtuples = 0;

	/* do the heap scan */
	reltuples = IndexBuildHeapScan(heap, index, indexInfo, true,
								   hashbuildCallback, (void *) &buildstate);

	if (buildstate.spool)
	{
		/* sort the tuples and insert them into the index */
		_h_indexbuild(buildstate.spool);
		_h_spooldestroy(buildstate.spool);
	}

	/*
	 * Return statistics
	 */
	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));

	result->heap_tuples = reltuples;
	result->index_tuples = buildstate.indtuples;

	PG_RETURN_POINTER(result);
}

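/*
 * Illustration of the sizing test above (hypothetical numbers, not from
 * this file): with the default 8K block size and shared_buffers = 128MB,
 * NBuffers is 16384, so a build whose initial bucket count reaches 16384
 * one-page buckets takes the spool-and-sort path, while anything smaller
 * is inserted directly in heap order.
 */
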
/*
 * Per-tuple callback from IndexBuildHeapScan
 */
static void
hashbuildCallback(Relation index,
				  HeapTuple htup,
				  Datum *values,
				  bool *isnull,
				  bool tupleIsAlive,
				  void *state)
{
	HashBuildState *buildstate = (HashBuildState *) state;
	IndexTuple	itup;

	/* form an index tuple and point it at the heap tuple */
	itup = _hash_form_tuple(index, values, isnull);
	itup->t_tid = htup->t_self;

	/* Hash indexes don't index nulls, see notes in hashinsert */
	if (IndexTupleHasNulls(itup))
	{
		pfree(itup);
		return;
	}

	/* Either spool the tuple for sorting, or just put it into the index */
	if (buildstate->spool)
		_h_spool(itup, buildstate->spool);
	else
		_hash_doinsert(index, itup);

	buildstate->indtuples += 1;

	pfree(itup);
}

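/*
 * Note on the spool path above: _h_spool() only buffers the tuple in a
 * tuplesort keyed by its expected bucket number; the actual page
 * insertions are deferred until hashbuild's _h_indexbuild() call, which
 * emits the tuples in bucket order.
 */
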
/*
 *	hashinsert() -- insert an index tuple into a hash table.
 *
 *	Hash on the heap tuple's key, form an index tuple with hash code.
 *	Find the appropriate location for the new tuple, and put it there.
 */
Datum
hashinsert(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	Datum	   *values = (Datum *) PG_GETARG_POINTER(1);
	bool	   *isnull = (bool *) PG_GETARG_POINTER(2);
	ItemPointer ht_ctid = (ItemPointer) PG_GETARG_POINTER(3);

#ifdef NOT_USED
	Relation	heapRel = (Relation) PG_GETARG_POINTER(4);
	bool		checkUnique = PG_GETARG_BOOL(5);
#endif
	IndexTuple	itup;

	/* generate an index tuple */
	itup = _hash_form_tuple(rel, values, isnull);
	itup->t_tid = *ht_ctid;

	/*
	 * If the single index key is null, we don't insert it into the index.
	 * Hash tables support scans on '='. Relational algebra says that A = B
	 * returns null if either A or B is null.  This means that no
	 * qualification used in an index scan could ever return true on a null
	 * attribute.  It also means that indices can't be used by ISNULL or
	 * NOTNULL scans, but that's an artifact of the strategy map architecture
	 * chosen in 1986, not of the way nulls are handled here.
	 */
	if (IndexTupleHasNulls(itup))
	{
		pfree(itup);
		PG_RETURN_BOOL(false);
	}

	_hash_doinsert(rel, itup);

	pfree(itup);

	PG_RETURN_BOOL(true);
}

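/*
 * Concrete form of the null argument above: for a hypothetical indexed
 * column k, a qual such as "k = NULL" (or '=' against any null value)
 * evaluates to null, never true, so no indexable qual can match a null
 * key and we may simply decline to store one.
 */
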
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */
Datum
hashgettuple(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;
	Page		page;
	OffsetNumber offnum;
	bool		res;

	/* Hash indexes are always lossy since we store only the hash code */
	scan->xs_recheck = true;

	/*
	 * We hold pin but not lock on current buffer while outside the hash AM.
	 * Reacquire the read lock here.
	 */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);

	/*
	 * If we've already initialized this scan, we can just advance it in the
	 * appropriate direction.  If we haven't done so yet, we call a routine to
	 * get the first item in the scan.
	 */
	if (ItemPointerIsValid(&(so->hashso_curpos)))
	{
		/*
		 * Check to see if we should kill the previously-fetched tuple.
		 */
		if (scan->kill_prior_tuple)
		{
			/*
			 * Yes, so mark it by setting the LP_DEAD state in the item flags.
			 */
			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
			page = BufferGetPage(so->hashso_curbuf);
			ItemIdMarkDead(PageGetItemId(page, offnum));

			/*
			 * Since this can be redone later if needed, it's treated the same
			 * as a commit-hint-bit status update for heap tuples: we mark the
			 * buffer dirty but don't make a WAL log entry.
			 */
			SetBufferCommitInfoNeedsSave(so->hashso_curbuf);
		}

		/*
		 * Now continue the scan.
		 */
		res = _hash_next(scan, dir);
	}
	else
		res = _hash_first(scan, dir);

	/*
	 * Skip killed tuples if asked to.
	 */
	if (scan->ignore_killed_tuples)
	{
		while (res)
		{
			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
			page = BufferGetPage(so->hashso_curbuf);
			if (!ItemIdIsDead(PageGetItemId(page, offnum)))
				break;
			res = _hash_next(scan, dir);
		}
	}

	/* Release read lock on current buffer, but keep it pinned */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);

	PG_RETURN_BOOL(res);
}

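/*
 * A minimal sketch of how the executor drives this entry point, assuming
 * the usual genam wrappers of this release:
 *
 *		scan = index_beginscan(heapRel, indexRel, snapshot, 1, &skey);
 *		while ((htup = index_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			... process htup; since xs_recheck is always set here, the
 *			... caller must re-evaluate the original qual against it
 *		}
 *		index_endscan(scan);
 */
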
/*
 *	hashgetbitmap() -- get all tuples at once
 */
Datum
hashgetbitmap(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	bool		res;
	int64		ntids = 0;

	res = _hash_first(scan, ForwardScanDirection);

	while (res)
	{
		bool		add_tuple;

		CHECK_FOR_INTERRUPTS();

		/*
		 * Skip killed tuples if asked to.
		 */
		if (scan->ignore_killed_tuples)
		{
			Page		page;
			OffsetNumber offnum;

			offnum = ItemPointerGetOffsetNumber(&(so->hashso_curpos));
			page = BufferGetPage(so->hashso_curbuf);
			add_tuple = !ItemIdIsDead(PageGetItemId(page, offnum));
		}
		else
			add_tuple = true;

		/* Save tuple ID, and continue scanning */
		if (add_tuple)
		{
			/* Note we mark the tuple ID as requiring recheck */
			tbm_add_tuples(tbm, &scan->xs_ctup.t_self, 1, true);
			ntids++;
		}

		res = _hash_next(scan, ForwardScanDirection);
	}

	PG_RETURN_INT64(ntids);
}

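/*
 * Because only the hash code is stored in the index, every TID handed to
 * the bitmap above is flagged for recheck; the bitmap heap scan that
 * consumes this result re-evaluates the original scan qual against each
 * heap tuple it fetches.
 */
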
/*
 *	hashbeginscan() -- start a scan on a hash index
 */
Datum
hashbeginscan(PG_FUNCTION_ARGS)
{
	Relation	rel = (Relation) PG_GETARG_POINTER(0);
	int			keysz = PG_GETARG_INT32(1);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(2);
	IndexScanDesc scan;
	HashScanOpaque so;

	scan = RelationGetIndexScan(rel, keysz, scankey);
	so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
	so->hashso_bucket_valid = false;
	so->hashso_bucket_blkno = 0;
	so->hashso_curbuf = InvalidBuffer;
	/* set position invalid (this will cause _hash_first call) */
	ItemPointerSetInvalid(&(so->hashso_curpos));

	scan->opaque = so;

	/* register scan in case we change pages it's using */
	_hash_regscan(scan);

	PG_RETURN_POINTER(scan);
}

/*
 *	hashrescan() -- rescan an index relation
 */
Datum
hashrescan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	ScanKey		scankey = (ScanKey) PG_GETARG_POINTER(1);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* if we are called from beginscan, so is still NULL */
	if (so)
	{
		/* release any pin we still hold */
		if (BufferIsValid(so->hashso_curbuf))
			_hash_dropbuf(rel, so->hashso_curbuf);
		so->hashso_curbuf = InvalidBuffer;

		/* release lock on bucket, too */
		if (so->hashso_bucket_blkno)
			_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
		so->hashso_bucket_blkno = 0;

		/* set position invalid (this will cause _hash_first call) */
		ItemPointerSetInvalid(&(so->hashso_curpos));
	}

	/* Update scan key, if a new one is given */
	if (scankey && scan->numberOfKeys > 0)
	{
		memmove(scan->keyData,
				scankey,
				scan->numberOfKeys * sizeof(ScanKeyData));
		if (so)
			so->hashso_bucket_valid = false;
	}

	PG_RETURN_VOID();
}

/*
 *	hashendscan() -- close down a scan
 */
Datum
hashendscan(PG_FUNCTION_ARGS)
{
	IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Relation	rel = scan->indexRelation;

	/* don't need scan registered anymore */
	_hash_dropscan(scan);

	/* release any pin we still hold */
	if (BufferIsValid(so->hashso_curbuf))
		_hash_dropbuf(rel, so->hashso_curbuf);
	so->hashso_curbuf = InvalidBuffer;

	/* release lock on bucket, too */
	if (so->hashso_bucket_blkno)
		_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
	so->hashso_bucket_blkno = 0;

	pfree(so);
	scan->opaque = NULL;

	PG_RETURN_VOID();
}

/*
 *	hashmarkpos() -- save current scan position
 */
Datum
hashmarkpos(PG_FUNCTION_ARGS)
{
	elog(ERROR, "hash does not support mark/restore");
	PG_RETURN_VOID();
}

/*
 *	hashrestrpos() -- restore scan to last saved position
 */
Datum
hashrestrpos(PG_FUNCTION_ARGS)
{
	elog(ERROR, "hash does not support mark/restore");
	PG_RETURN_VOID();
}

/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashbulkdelete(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(2);
	void	   *callback_state = (void *) PG_GETARG_POINTER(3);
	Relation	rel = info->index;
	double		tuples_removed;
	double		num_index_tuples;
	double		orig_ntuples;
	Bucket		orig_maxbucket;
	Bucket		cur_maxbucket;
	Bucket		cur_bucket;
	Buffer		metabuf;
	HashMetaPage metap;
	HashMetaPageData local_metapage;

	tuples_removed = 0;
	num_index_tuples = 0;

	/*
	 * Read the metapage to fetch original bucket and tuple counts.  Also, we
	 * keep a copy of the last-seen metapage so that we can use its
	 * hashm_spares[] values to compute bucket page addresses.  This is a bit
	 * hokey but perfectly safe, since the interesting entries in the spares
	 * array cannot change under us; and it beats rereading the metapage for
	 * each bucket.
	 */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));
	orig_maxbucket = metap->hashm_maxbucket;
	orig_ntuples = metap->hashm_ntuples;
	memcpy(&local_metapage, metap, sizeof(local_metapage));
	_hash_relbuf(rel, metabuf);

	/* Scan the buckets that we know exist */
	cur_bucket = 0;
	cur_maxbucket = orig_maxbucket;

loop_top:
	while (cur_bucket <= cur_maxbucket)
	{
		BlockNumber bucket_blkno;
		BlockNumber blkno;
		bool		bucket_dirty = false;

		/* Get address of bucket's start page */
		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);

		/* Exclusive-lock the bucket so we can shrink it */
		_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Shouldn't have any active scans locally, either */
		if (_hash_has_active_scan(rel, cur_bucket))
			elog(ERROR, "hash index has active scan during VACUUM");

		/* Scan each page in bucket */
		blkno = bucket_blkno;
		while (BlockNumberIsValid(blkno))
		{
			Buffer		buf;
			Page		page;
			HashPageOpaque opaque;
			OffsetNumber offno;
			OffsetNumber maxoffno;
			bool		page_dirty = false;

			vacuum_delay_point();

			buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
										   LH_BUCKET_PAGE | LH_OVERFLOW_PAGE,
											 info->strategy);
			page = BufferGetPage(buf);
			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
			Assert(opaque->hasho_bucket == cur_bucket);

			/* Scan each tuple in page */
			offno = FirstOffsetNumber;
			maxoffno = PageGetMaxOffsetNumber(page);
			while (offno <= maxoffno)
			{
				IndexTuple	itup;
				ItemPointer htup;

				itup = (IndexTuple) PageGetItem(page,
												PageGetItemId(page, offno));
				htup = &(itup->t_tid);
				if (callback(htup, callback_state))
				{
					/* delete the item from the page */
					PageIndexTupleDelete(page, offno);
					bucket_dirty = page_dirty = true;

					/* don't increment offno, instead decrement maxoffno */
					maxoffno = OffsetNumberPrev(maxoffno);

					tuples_removed += 1;
				}
				else
				{
					offno = OffsetNumberNext(offno);

					num_index_tuples += 1;
				}
			}

			/*
			 * Write page if needed, advance to next page.
			 */
			blkno = opaque->hasho_nextblkno;

			if (page_dirty)
				_hash_wrtbuf(rel, buf);
			else
				_hash_relbuf(rel, buf);
		}

		/* If we deleted anything, try to compact free space */
		if (bucket_dirty)
			_hash_squeezebucket(rel, cur_bucket, bucket_blkno,
								info->strategy);

		/* Release bucket lock */
		_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);

		/* Advance to next bucket */
		cur_bucket++;
	}

	/* Write-lock metapage and check for split since we started */
	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
	metap = HashPageGetMeta(BufferGetPage(metabuf));

	if (cur_maxbucket != metap->hashm_maxbucket)
	{
		/* There's been a split, so process the additional bucket(s) */
		cur_maxbucket = metap->hashm_maxbucket;
		memcpy(&local_metapage, metap, sizeof(local_metapage));
		_hash_relbuf(rel, metabuf);
		goto loop_top;
	}

	/* Okay, we're really done.  Update tuple count in metapage. */

	if (orig_maxbucket == metap->hashm_maxbucket &&
		orig_ntuples == metap->hashm_ntuples)
	{
		/*
		 * No one has split or inserted anything since start of scan, so
		 * believe our count as gospel.
		 */
		metap->hashm_ntuples = num_index_tuples;
	}
	else
	{
		/*
		 * Otherwise, our count is untrustworthy since we may have
		 * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
		 */
		if (metap->hashm_ntuples > tuples_removed)
			metap->hashm_ntuples -= tuples_removed;
		else
			metap->hashm_ntuples = 0;
		num_index_tuples = metap->hashm_ntuples;
	}

	_hash_wrtbuf(rel, metabuf);

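	/*
	 * Dead-reckoning example (hypothetical numbers): if a split occurred
	 * mid-scan while hashm_ntuples was 1000 and we removed 300 tuples, we
	 * store 1000 - 300 = 700 rather than trusting num_index_tuples, since
	 * tuples relocated by the split may have been counted in both the old
	 * and the new bucket.
	 */
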
	/* return statistics */
	if (stats == NULL)
		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
	stats->num_index_tuples = num_index_tuples;
	stats->tuples_removed += tuples_removed;
	/* hashvacuumcleanup will fill in num_pages */

	PG_RETURN_POINTER(stats);
}

/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
Datum
hashvacuumcleanup(PG_FUNCTION_ARGS)
{
	IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0);
	IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1);
	Relation	rel = info->index;
	BlockNumber num_pages;

	/* If hashbulkdelete wasn't called, return NULL signifying no change */
	/* Note: this covers the analyze_only case too */
	if (stats == NULL)
		PG_RETURN_POINTER(NULL);

	/* update statistics */
	num_pages = RelationGetNumberOfBlocks(rel);
	stats->num_pages = num_pages;

	PG_RETURN_POINTER(stats);
}

void
hash_redo(XLogRecPtr lsn, XLogRecord *record)
{
	elog(PANIC, "hash_redo: unimplemented");
}

void
hash_desc(StringInfo buf, uint8 xl_info, char *rec)
{
}

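/*
 * Hash indexes are not WAL-logged in this release, so there are no hash
 * WAL records for hash_redo() to replay (hence the PANIC) and nothing for
 * hash_desc() to print; after a crash, an affected hash index must be
 * rebuilt with REINDEX.
 */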