/*-------------------------------------------------------------------------
 *
 * hashpage.c
 *	  Hash table page management code for the Postgres hash access method
 *
 * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.47 2004-12-31 21:59:13 pgsql Exp $
 *
 * NOTES
 *	  Postgres hash pages look like ordinary relation pages.  The opaque
 *	  data at high addresses includes information about the page including
 *	  whether a page is an overflow page or a true bucket, the bucket
 *	  number, and the block numbers of the preceding and following pages
 *	  in the same bucket.
 *
 *	  The first page in a hash relation, page zero, is special -- it stores
 *	  information describing the hash table; it is referred to as the
 *	  "meta page."  Pages one and higher store the actual data.
 *
 *	  There are also bitmap pages, which are not manipulated here;
 *	  see hashovfl.c.
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/hash.h"
#include "storage/lmgr.h"
#include "utils/lsyscache.h"


static void _hash_splitbucket(Relation rel, Buffer metabuf,
                              Bucket obucket, Bucket nbucket,
                              BlockNumber start_oblkno,
                              BlockNumber start_nblkno,
                              uint32 maxbucket,
                              uint32 highmask, uint32 lowmask);

/*
 * We use high-concurrency locking on hash indexes (see README for an overview
 * of the locking rules).  However, we can skip taking lmgr locks when the
 * index is local to the current backend (ie, either temp or new in the
 * current transaction).  No one else can see it, so there's no reason to
 * take locks.  We still take buffer-level locks, but not lmgr locks.
 */
#define USELOCKING(rel)		(!RELATION_IS_LOCAL(rel))

/*
 * _hash_getlock() -- Acquire an lmgr lock.
 *
 * 'whichlock' should be zero to acquire the split-control lock, or the
 * block number of a bucket's primary bucket page to acquire the per-bucket
 * lock.  (See README for details of the use of these locks.)
 *
 * 'access' must be HASH_SHARE or HASH_EXCLUSIVE.
 */
void
_hash_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        LockPage(rel, whichlock, access);
}

/*
 * _hash_try_getlock() -- Acquire an lmgr lock, but only if it's free.
 *
 * Same as above except we return FALSE without blocking if lock isn't free.
 */
bool
_hash_try_getlock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        return ConditionalLockPage(rel, whichlock, access);
    else
        return true;
}

/*
 * _hash_droplock() -- Release an lmgr lock.
 */
void
_hash_droplock(Relation rel, BlockNumber whichlock, int access)
{
    if (USELOCKING(rel))
        UnlockPage(rel, whichlock, access);
}

/*
 *	_hash_getbuf() -- Get a buffer by block number for read or write.
 *
 *		'access' must be HASH_READ, HASH_WRITE, or HASH_NOLOCK.
 *
 *		When this routine returns, the appropriate lock is set on the
 *		requested buffer and its reference count has been incremented
 *		(ie, the buffer is "locked and pinned").
 *
 *		XXX P_NEW is not used because, unlike the tree structures, we
 *		need the bucket blocks to be at certain block numbers.  We must
 *		depend on the caller to call _hash_pageinit on the block if it
 *		knows that this is a new block.
 */
Buffer
_hash_getbuf(Relation rel, BlockNumber blkno, int access)
{
    Buffer      buf;

    if (blkno == P_NEW)
        elog(ERROR, "hash AM does not use P_NEW");

    buf = ReadBuffer(rel, blkno);

    if (access != HASH_NOLOCK)
        LockBuffer(buf, access);

    /* ref count and lock type are correct */
    return buf;
}
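
/*
 * The usual pattern in this file is to fetch and lock a page, work on it,
 * and then hand it back with _hash_relbuf (unchanged) or _hash_wrtbuf
 * (modified):
 *
 *		buf = _hash_getbuf(rel, blkno, HASH_READ);
 *		page = BufferGetPage(buf);
 *		... examine page ...
 *		_hash_relbuf(rel, buf);
 */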

/*
 *	_hash_relbuf() -- release a locked buffer.
 *
 * Lock and pin (refcount) are both dropped.  Note that either read or
 * write lock can be dropped this way, but if we modified the buffer,
 * this is NOT the right way to release a write lock.
 */
void
_hash_relbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);
}

/*
 *	_hash_dropbuf() -- release an unlocked buffer.
 *
 * This is used to unpin a buffer on which we hold no lock.  It is assumed
 * that the buffer is not dirty.
 */
void
_hash_dropbuf(Relation rel, Buffer buf)
{
    ReleaseBuffer(buf);
}

/*
 *	_hash_wrtbuf() -- write a hash page to disk.
 *
 *		This routine releases the lock held on the buffer and our refcount
 *		for it.  It is an error to call _hash_wrtbuf() without a write lock
 *		and a pin on the buffer.
 *
 * NOTE: actually, the buffer manager just marks the shared buffer page
 * dirty here; the real I/O happens later.  This is okay since we are not
 * relying on write ordering anyway.  The WAL mechanism is responsible for
 * guaranteeing correctness after a crash.
 */
void
_hash_wrtbuf(Relation rel, Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    WriteBuffer(buf);
}

/*
 *	_hash_wrtnorelbuf() -- write a hash page to disk, but do not release
 *						   our reference or lock.
 *
 *		It is an error to call _hash_wrtnorelbuf() without a write lock
 *		and a pin on the buffer.
 *
 *		See above NOTE.
 */
void
_hash_wrtnorelbuf(Relation rel, Buffer buf)
{
    WriteNoReleaseBuffer(buf);
}

/*
 * _hash_chgbufaccess() -- Change the lock type on a buffer, without
 *			dropping our pin on it.
 *
 * from_access and to_access may be HASH_READ, HASH_WRITE, or HASH_NOLOCK,
 * the last indicating that no buffer-level lock is held or wanted.
 *
 * When from_access == HASH_WRITE, we assume the buffer is dirty and tell
 * bufmgr it must be written out.  If the caller wants to release a write
 * lock on a page that's not been modified, it's okay to pass from_access
 * as HASH_READ (a bit ugly, but handy in some places).
 */
void
_hash_chgbufaccess(Relation rel,
                   Buffer buf,
                   int from_access,
                   int to_access)
{
    if (from_access != HASH_NOLOCK)
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    if (from_access == HASH_WRITE)
        WriteNoReleaseBuffer(buf);

    if (to_access != HASH_NOLOCK)
        LockBuffer(buf, to_access);
}
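
/*
 * For example, _hash_expandtable below write-locks the pinned metapage with
 *
 *		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);
 *
 * and later drops the lock, keeping the pin and marking the buffer dirty,
 * with
 *
 *		_hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);
 */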

/*
 *	_hash_metapinit() -- Initialize the metadata page of a hash index,
 *				the two buckets that we begin with and the initial
 *				bitmap page.
 *
 * We are fairly cavalier about locking here, since we know that no one else
 * could be accessing this index.  In particular the rule about not holding
 * multiple buffer locks is ignored.
 */
void
_hash_metapinit(Relation rel)
{
    HashMetaPage metap;
    HashPageOpaque pageopaque;
    Buffer      metabuf;
    Buffer      buf;
    Page        pg;
    int32       data_width;
    int32       item_width;
    int32       ffactor;
    uint16      i;

    /* safety check */
    if (RelationGetNumberOfBlocks(rel) != 0)
        elog(ERROR, "cannot initialize non-empty hash index \"%s\"",
             RelationGetRelationName(rel));

    /*
     * Determine the target fill factor (tuples per bucket) for this
     * index.  The idea is to make the fill factor correspond to pages
     * about 3/4ths full.  We can compute it exactly if the index datatype
     * is fixed-width, but for var-width there's some guessing involved.
     */
    data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid,
                                 RelationGetDescr(rel)->attrs[0]->atttypmod);
    item_width = MAXALIGN(sizeof(HashItemData)) + MAXALIGN(data_width) +
        sizeof(ItemIdData);     /* include the line pointer */
    ffactor = (BLCKSZ * 3 / 4) / item_width;
    /* keep to a sane range */
    if (ffactor < 10)
        ffactor = 10;
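
    /*
     * For instance, with BLCKSZ = 8192 and a fixed-width 4-byte key on a
     * platform where MAXALIGN is 4: item_width = 8 (aligned HashItemData)
     * + 4 (key) + 4 (line pointer) = 16, giving ffactor = 6144 / 16 = 384
     * tuples per bucket.  Exact figures vary with alignment rules.
     */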

    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));

    pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
    pageopaque->hasho_prevblkno = InvalidBlockNumber;
    pageopaque->hasho_nextblkno = InvalidBlockNumber;
    pageopaque->hasho_bucket = -1;
    pageopaque->hasho_flag = LH_META_PAGE;
    pageopaque->hasho_filler = HASHO_FILL;

    metap = (HashMetaPage) pg;

    metap->hashm_magic = HASH_MAGIC;
    metap->hashm_version = HASH_VERSION;
    metap->hashm_ntuples = 0;
    metap->hashm_nmaps = 0;
    metap->hashm_ffactor = ffactor;
    metap->hashm_bsize = BufferGetPageSize(metabuf);
    /* find largest bitmap array size that will fit in page size */
    for (i = _hash_log2(metap->hashm_bsize); i > 0; --i)
    {
        if ((1 << i) <= (metap->hashm_bsize -
                         (MAXALIGN(sizeof(PageHeaderData)) +
                          MAXALIGN(sizeof(HashPageOpaqueData)))))
            break;
    }
    Assert(i > 0);
    metap->hashm_bmsize = 1 << i;
    metap->hashm_bmshift = i + BYTE_TO_BIT;
    Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1));
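
    /*
     * With the default 8K page the header and special-space overhead is
     * well under 4K, so the loop settles on i = 12: a 4096-byte bitmap
     * array (hashm_bmshift = 15) tracking 4096 * 8 = 32768 overflow pages
     * per bitmap page.
     */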

    metap->hashm_procid = index_getprocid(rel, 1, HASHPROC);

    /*
     * We initialize the index with two buckets, 0 and 1, occupying
     * physical blocks 1 and 2.  The first freespace bitmap page is in
     * block 3.
     */
    metap->hashm_maxbucket = metap->hashm_lowmask = 1;	/* nbuckets - 1 */
    metap->hashm_highmask = 3;	/* (nbuckets << 1) - 1 */
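
    /*
     * (Block 0 is the metapage, so BUCKET_TO_BLKNO maps buckets 0 and 1 to
     * blocks 1 and 2; no overflow pages exist yet, so no spare pages enter
     * into that mapping.)
     */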

    MemSet((char *) metap->hashm_spares, 0, sizeof(metap->hashm_spares));
    MemSet((char *) metap->hashm_mapp, 0, sizeof(metap->hashm_mapp));

    metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */
    metap->hashm_ovflpoint = 1;
    metap->hashm_firstfree = 0;

    /*
     * Initialize the first two buckets
     */
    for (i = 0; i <= 1; i++)
    {
        buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
        pg = BufferGetPage(buf);
        _hash_pageinit(pg, BufferGetPageSize(buf));
        pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
        pageopaque->hasho_prevblkno = InvalidBlockNumber;
        pageopaque->hasho_nextblkno = InvalidBlockNumber;
        pageopaque->hasho_bucket = i;
        pageopaque->hasho_flag = LH_BUCKET_PAGE;
        pageopaque->hasho_filler = HASHO_FILL;
        _hash_wrtbuf(rel, buf);
    }

    /*
     * Initialize first bitmap page.  Can't do this until we create the
     * first two buckets, else smgr will complain.
     */
    _hash_initbitmap(rel, metap, 3);

    /* all done */
    _hash_wrtbuf(rel, metabuf);
}

/*
 *	_hash_pageinit() -- Initialize a new hash index page.
 */
void
_hash_pageinit(Page page, Size size)
{
    Assert(PageIsNew(page));
    PageInit(page, size, sizeof(HashPageOpaqueData));
}

/*
 * Attempt to expand the hash table by creating one new bucket.
 *
 * This will silently do nothing if it cannot get the needed locks.
 *
 * The caller should hold no locks on the hash index.
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.
 */
void
_hash_expandtable(Relation rel, Buffer metabuf)
{
    HashMetaPage metap;
    Bucket      old_bucket;
    Bucket      new_bucket;
    uint32      spare_ndx;
    BlockNumber start_oblkno;
    BlockNumber start_nblkno;
    uint32      maxbucket;
    uint32      highmask;
    uint32      lowmask;

    /*
     * Obtain the page-zero lock to assert the right to begin a split (see
     * README).
     *
     * Note: deadlock should be impossible here. Our own backend could only
     * be holding bucket sharelocks due to stopped indexscans; those will
     * not block other holders of the page-zero lock, who are only
     * interested in acquiring bucket sharelocks themselves.  Exclusive
     * bucket locks are only taken here and in hashbulkdelete, and neither
     * of these operations needs any additional locks to complete.  (If,
     * due to some flaw in this reasoning, we manage to deadlock anyway,
     * it's okay to error out; the index will be left in a consistent
     * state.)
     */
    _hash_getlock(rel, 0, HASH_EXCLUSIVE);

    /* Write-lock the meta page */
    _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE);

    metap = (HashMetaPage) BufferGetPage(metabuf);
    _hash_checkpage(rel, (Page) metap, LH_META_PAGE);

    /*
     * Check to see if split is still needed; someone else might have
     * already done one while we waited for the lock.
     *
     * Make sure this stays in sync with _hash_doinsert()
     */
    if (metap->hashm_ntuples <=
        (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1))
        goto fail;

    /*
     * Determine which bucket is to be split, and attempt to lock the old
     * bucket.  If we can't get the lock, give up.
     *
     * The lock protects us against other backends, but not against our own
     * backend.  Must check for active scans separately.
     *
     * Ideally we would lock the new bucket too before proceeding, but if we
     * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping
     * isn't correct yet.  For simplicity we update the metapage first and
     * then lock.  This should be okay because no one else should be
     * trying to lock the new bucket yet...
     */
    new_bucket = metap->hashm_maxbucket + 1;
    old_bucket = (new_bucket & metap->hashm_lowmask);

    start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
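
    /*
     * For example, if maxbucket is 4 and lowmask is 3, the bucket to be
     * created is 5 (binary 101) and the bucket to be split is 5 & 3 = 1:
     * bucket 1's tuples are redistributed between buckets 1 and 5, which
     * differ only in the new high bit.
     */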

    if (_hash_has_active_scan(rel, old_bucket))
        goto fail;

    if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
        goto fail;

    /*
     * Okay to proceed with split.  Update the metapage bucket mapping
     * info.
     */
    metap->hashm_maxbucket = new_bucket;

    if (new_bucket > metap->hashm_highmask)
    {
        /* Starting a new doubling */
        metap->hashm_lowmask = metap->hashm_highmask;
        metap->hashm_highmask = new_bucket | metap->hashm_lowmask;
    }
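
    /*
     * For example, when bucket 4 (binary 100) is created it exceeds the
     * old highmask of 3, so lowmask becomes 3 and highmask becomes
     * 4 | 3 = 7; hash values are thereafter spread across buckets 0..7
     * as those buckets come into existence.
     */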

    /*
     * If the split point is increasing (hashm_maxbucket's log base 2
     * increases), we need to adjust the hashm_spares[] array and
     * hashm_ovflpoint so that future overflow pages will be created
     * beyond this new batch of bucket pages.
     *
     * XXX should initialize new bucket pages to prevent out-of-order page
     * creation?  Don't wanna do it right here though.
     */
    spare_ndx = _hash_log2(metap->hashm_maxbucket + 1);
    if (spare_ndx > metap->hashm_ovflpoint)
    {
        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
        metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint];
        metap->hashm_ovflpoint = spare_ndx;
    }
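
    /*
     * For example, creating bucket 4 raises _hash_log2(hashm_maxbucket + 1)
     * from 2 to 3, so spares[3] inherits the overflow-page count recorded
     * at splitpoint 2.  Since BUCKET_TO_BLKNO adds the spare count of a
     * bucket's splitpoint to its bucket number, previously allocated
     * overflow pages do not disturb the block numbers of the new buckets.
     */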

    /* now we can compute the new bucket's primary block number */
    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);

    Assert(!_hash_has_active_scan(rel, new_bucket));

    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
        elog(PANIC, "could not get lock on supposedly new bucket");

    /*
     * Copy bucket mapping info now; this saves re-accessing the meta page
     * inside _hash_splitbucket's inner loop.  Note that once we drop the
     * split lock, other splits could begin, so these values might be out
     * of date before _hash_splitbucket finishes.  That's okay, since all
     * it needs is to tell which of these two buckets to map hashkeys
     * into.
     */
    maxbucket = metap->hashm_maxbucket;
    highmask = metap->hashm_highmask;
    lowmask = metap->hashm_lowmask;

    /* Write out the metapage and drop lock, but keep pin */
    _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK);

    /* Release split lock; okay for other splits to occur now */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);

    /* Relocate records to the new bucket */
    _hash_splitbucket(rel, metabuf, old_bucket, new_bucket,
                      start_oblkno, start_nblkno,
                      maxbucket, highmask, lowmask);

    /* Release bucket locks, allowing others to access them */
    _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
    _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);

    return;

    /* Here if decide not to split or fail to acquire old bucket lock */
fail:

    /* We didn't write the metapage, so just drop lock */
    _hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);

    /* Release split lock */
    _hash_droplock(rel, 0, HASH_EXCLUSIVE);
}

/*
 * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket'
 *
 * We are splitting a bucket that consists of a base bucket page and zero
 * or more overflow (bucket chain) pages.  We must relocate tuples that
 * belong in the new bucket, and compress out any free space in the old
 * bucket.
 *
 * The caller must hold exclusive locks on both buckets to ensure that
 * no one else is trying to access them (see README).
 *
 * The caller must hold a pin, but no lock, on the metapage buffer.
 * The buffer is returned in the same state.  (The metapage is only
 * touched if it becomes necessary to add or remove overflow pages.)
 */
static void
_hash_splitbucket(Relation rel,
                  Buffer metabuf,
                  Bucket obucket,
                  Bucket nbucket,
                  BlockNumber start_oblkno,
                  BlockNumber start_nblkno,
                  uint32 maxbucket,
                  uint32 highmask,
                  uint32 lowmask)
{
    Bucket      bucket;
    Buffer      obuf;
    Buffer      nbuf;
    BlockNumber oblkno;
    BlockNumber nblkno;
    bool        null;
    Datum       datum;
    HashItem    hitem;
    HashPageOpaque oopaque;
    HashPageOpaque nopaque;
    IndexTuple  itup;
    Size        itemsz;
    OffsetNumber ooffnum;
    OffsetNumber noffnum;
    OffsetNumber omaxoffnum;
    Page        opage;
    Page        npage;
    TupleDesc   itupdesc = RelationGetDescr(rel);

    /*
     * It should be okay to simultaneously write-lock pages from each
     * bucket, since no one else can be trying to acquire buffer lock on
     * pages of either bucket.
     */
    oblkno = start_oblkno;
    nblkno = start_nblkno;
    obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
    nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
    opage = BufferGetPage(obuf);
    npage = BufferGetPage(nbuf);

    _hash_checkpage(rel, opage, LH_BUCKET_PAGE);
    oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);

    /* initialize the new bucket's primary page */
    _hash_pageinit(npage, BufferGetPageSize(nbuf));
    nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
    nopaque->hasho_prevblkno = InvalidBlockNumber;
    nopaque->hasho_nextblkno = InvalidBlockNumber;
    nopaque->hasho_bucket = nbucket;
    nopaque->hasho_flag = LH_BUCKET_PAGE;
    nopaque->hasho_filler = HASHO_FILL;

    /*
     * Partition the tuples in the old bucket between the old bucket and
     * the new bucket, advancing along the old bucket's overflow bucket
     * chain and adding overflow pages to the new bucket as needed.
     */
    ooffnum = FirstOffsetNumber;
    omaxoffnum = PageGetMaxOffsetNumber(opage);

    for (;;)
    {
        /*
         * at each iteration through this loop, each of these variables
         * should be up-to-date: obuf opage oopaque ooffnum omaxoffnum
         */

        /* check if we're at the end of the page */
        if (ooffnum > omaxoffnum)
        {
            /* at end of page, but check for an(other) overflow page */
            oblkno = oopaque->hasho_nextblkno;
            if (!BlockNumberIsValid(oblkno))
                break;

            /*
             * we ran out of tuples on this particular page, but we have
             * more overflow pages; advance to next page.
             */
            _hash_wrtbuf(rel, obuf);

            obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
            opage = BufferGetPage(obuf);
            _hash_checkpage(rel, opage, LH_OVERFLOW_PAGE);
            oopaque = (HashPageOpaque) PageGetSpecialPointer(opage);
            ooffnum = FirstOffsetNumber;
            omaxoffnum = PageGetMaxOffsetNumber(opage);
            continue;
        }

        /*
         * Re-hash the tuple to determine which bucket it now belongs in.
         *
         * It is annoying to call the hash function while holding locks, but
         * releasing and relocking the page for each tuple is unappealing
         * too.
         */
        hitem = (HashItem) PageGetItem(opage, PageGetItemId(opage, ooffnum));
        itup = &(hitem->hash_itup);
        datum = index_getattr(itup, 1, itupdesc, &null);
        Assert(!null);

        bucket = _hash_hashkey2bucket(_hash_datum2hashkey(rel, datum),
                                      maxbucket, highmask, lowmask);
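
        /*
         * _hash_hashkey2bucket masks the hash value with highmask and, if
         * the result names a bucket beyond maxbucket (one that does not
         * exist yet), masks with lowmask instead; here the answer is
         * therefore always either obucket or nbucket.
         */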
        if (bucket == nbucket)
        {
            /*
             * insert the tuple into the new bucket.  if it doesn't fit on
             * the current page in the new bucket, we must allocate a new
             * overflow page and place the tuple on that page instead.
             */
            itemsz = IndexTupleDSize(hitem->hash_itup)
                + (sizeof(HashItemData) - sizeof(IndexTupleData));

            itemsz = MAXALIGN(itemsz);

            if (PageGetFreeSpace(npage) < itemsz)
            {
                /* write out nbuf and drop lock, but keep pin */
                _hash_chgbufaccess(rel, nbuf, HASH_WRITE, HASH_NOLOCK);
                /* chain to a new overflow page */
                nbuf = _hash_addovflpage(rel, metabuf, nbuf);
                npage = BufferGetPage(nbuf);
                _hash_checkpage(rel, npage, LH_OVERFLOW_PAGE);
                /* we don't need nopaque within the loop */
            }

            noffnum = OffsetNumberNext(PageGetMaxOffsetNumber(npage));
            if (PageAddItem(npage, (Item) hitem, itemsz, noffnum, LP_USED)
                == InvalidOffsetNumber)
                elog(ERROR, "failed to add index item to \"%s\"",
                     RelationGetRelationName(rel));

            /*
             * now delete the tuple from the old bucket.  after this
             * section of code, 'ooffnum' will actually point to the
             * ItemId to which we would point if we had advanced it before
             * the deletion (PageIndexTupleDelete repacks the ItemId
             * array).  this also means that 'omaxoffnum' is exactly one
             * less than it used to be, so we really can just decrement it
             * instead of calling PageGetMaxOffsetNumber.
             */
            PageIndexTupleDelete(opage, ooffnum);
            omaxoffnum = OffsetNumberPrev(omaxoffnum);
        }
        else
        {
            /*
             * the tuple stays on this page.  we didn't move anything, so
             * we didn't delete anything and therefore we don't have to
             * change 'omaxoffnum'.
             */
            Assert(bucket == obucket);
            ooffnum = OffsetNumberNext(ooffnum);
        }
    }

    /*
     * We're at the end of the old bucket chain, so we're done partitioning
     * the tuples.  Before quitting, call _hash_squeezebucket to ensure the
     * tuples remaining in the old bucket (including the overflow pages)
     * are packed as tightly as possible.  The new bucket is already as
     * tight as possible.
     */
    _hash_wrtbuf(rel, obuf);
    _hash_wrtbuf(rel, nbuf);

    _hash_squeezebucket(rel, obucket, start_oblkno);
}