1
/*-------------------------------------------------------------------------
4
* routines for handling GIN entry tree pages.
7
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8
* Portions Copyright (c) 1994, Regents of the University of California
11
* src/backend/access/gin/ginentrypage.c
12
*-------------------------------------------------------------------------
17
#include "access/gin_private.h"
18
#include "access/ginxlog.h"
19
#include "access/xloginsert.h"
20
#include "miscadmin.h"
21
#include "utils/rel.h"
23
static void entrySplitPage(GinBtree btree, Buffer origbuf,
25
GinBtreeEntryInsertData *insertData,
26
BlockNumber updateblkno,
27
Page *newlpage, Page *newrpage);
30
* Form a tuple for entry tree.
32
* If the tuple would be too big to be stored, function throws a suitable
33
* error if errorTooBig is true, or returns NULL if errorTooBig is false.
35
* See src/backend/access/gin/README for a description of the index tuple
36
* format that is being built here. We build on the assumption that we
37
* are making a leaf-level key entry containing a posting list of nipd items.
38
* If the caller is actually trying to make a posting-tree entry, non-leaf
39
* entry, or pending-list entry, it should pass dataSize = 0 and then overwrite
40
* the t_tid fields as necessary. In any case, 'data' can be NULL to skip
41
* filling in the posting list; the caller is responsible for filling it
42
* afterwards if data = NULL and nipd > 0.
45
GinFormTuple(GinState *ginstate,
46
OffsetNumber attnum, Datum key, GinNullCategory category,
47
Pointer data, Size dataSize, int nipd,
55
/* Build the basic tuple: optional column number, plus key datum */
59
isnull[0] = (category != GIN_CAT_NORM_KEY);
63
datums[0] = UInt16GetDatum(attnum);
66
isnull[1] = (category != GIN_CAT_NORM_KEY);
69
itup = index_form_tuple(ginstate->tupdesc[attnum - 1], datums, isnull);
72
* Determine and store offset to the posting list, making sure there is
73
* room for the category byte if needed.
75
* Note: because index_form_tuple MAXALIGNs the tuple size, there may well
76
* be some wasted pad space. Is it worth recomputing the data length to
77
* prevent that? That would also allow us to Assert that the real data
78
* doesn't overlap the GinNullCategory byte, which this code currently
81
newsize = IndexTupleSize(itup);
83
if (IndexTupleHasNulls(itup))
87
Assert(category != GIN_CAT_NORM_KEY);
88
minsize = GinCategoryOffset(itup, ginstate) + sizeof(GinNullCategory);
89
newsize = Max(newsize, minsize);
92
newsize = SHORTALIGN(newsize);
94
GinSetPostingOffset(itup, newsize);
95
GinSetNPosting(itup, nipd);
98
* Add space needed for posting list, if any. Then check that the tuple
99
* won't be too big to store.
103
newsize = MAXALIGN(newsize);
105
if (newsize > GinMaxItemSize)
109
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
110
errmsg("index row size %zu exceeds maximum %zu for index \"%s\"",
111
(Size) newsize, (Size) GinMaxItemSize,
112
RelationGetRelationName(ginstate->index))));
118
* Resize tuple if needed
120
if (newsize != IndexTupleSize(itup))
122
itup = repalloc(itup, newsize);
125
* PostgreSQL 9.3 and earlier did not clear this new space, so we
126
* might find uninitialized padding when reading tuples from disk.
128
memset((char *) itup + IndexTupleSize(itup),
129
0, newsize - IndexTupleSize(itup));
130
/* set new size in tuple header */
131
itup->t_info &= ~INDEX_SIZE_MASK;
132
itup->t_info |= newsize;
136
* Copy in the posting list, if provided
140
char *ptr = GinGetPosting(itup);
142
memcpy(ptr, data, dataSize);
146
* Insert category byte, if needed
148
if (category != GIN_CAT_NORM_KEY)
150
Assert(IndexTupleHasNulls(itup));
151
GinSetNullCategory(itup, ginstate, category);
157
* Read item pointers from leaf entry tuple.
159
* Returns a palloc'd array of ItemPointers. The number of items is returned
163
ginReadTuple(GinState *ginstate, OffsetNumber attnum, IndexTuple itup,
166
Pointer ptr = GinGetPosting(itup);
167
int nipd = GinGetNPosting(itup);
171
if (GinItupIsCompressed(itup))
175
ipd = ginPostingListDecode((GinPostingList *) ptr, &ndecoded);
176
if (nipd != ndecoded)
177
elog(ERROR, "number of items mismatch in GIN entry tuple, %d in tuple header, %d decoded",
187
ipd = (ItemPointer) palloc(sizeof(ItemPointerData) * nipd);
188
memcpy(ipd, ptr, sizeof(ItemPointerData) * nipd);
195
* Form a non-leaf entry tuple by copying the key data from the given tuple,
196
* which can be either a leaf or non-leaf entry tuple.
198
* Any posting list in the source tuple is not copied. The specified child
199
* block number is inserted into t_tid.
202
GinFormInteriorTuple(IndexTuple itup, Page page, BlockNumber childblk)
206
if (GinPageIsLeaf(page) && !GinIsPostingTree(itup))
208
/* Tuple contains a posting list, just copy stuff before that */
209
uint32 origsize = GinGetPostingOffset(itup);
211
origsize = MAXALIGN(origsize);
212
nitup = (IndexTuple) palloc(origsize);
213
memcpy(nitup, itup, origsize);
214
/* ... be sure to fix the size header field ... */
215
nitup->t_info &= ~INDEX_SIZE_MASK;
216
nitup->t_info |= origsize;
220
/* Copy the tuple as-is */
221
nitup = (IndexTuple) palloc(IndexTupleSize(itup));
222
memcpy(nitup, itup, IndexTupleSize(itup));
225
/* Now insert the correct downlink */
226
GinSetDownlink(nitup, childblk);
232
* Entry tree is a "static", ie tuple never deletes from it,
233
* so we don't use right bound, we use rightmost key instead.
236
getRightMostTuple(Page page)
238
OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
240
return (IndexTuple) PageGetItem(page, PageGetItemId(page, maxoff));
244
entryIsMoveRight(GinBtree btree, Page page)
249
GinNullCategory category;
251
if (GinPageRightMost(page))
254
itup = getRightMostTuple(page);
255
attnum = gintuple_get_attrnum(btree->ginstate, itup);
256
key = gintuple_get_key(btree->ginstate, itup, &category);
258
if (ginCompareAttEntries(btree->ginstate,
259
btree->entryAttnum, btree->entryKey, btree->entryCategory,
260
attnum, key, category) > 0)
267
* Find correct tuple in non-leaf page. It supposed that
268
* page correctly chosen and searching value SHOULD be on page
271
entryLocateEntry(GinBtree btree, GinBtreeStack *stack)
276
IndexTuple itup = NULL;
278
Page page = BufferGetPage(stack->buffer);
280
Assert(!GinPageIsLeaf(page));
281
Assert(!GinPageIsData(page));
285
stack->off = FirstOffsetNumber;
286
stack->predictNumber *= PageGetMaxOffsetNumber(page);
287
return btree->getLeftMostChild(btree, page);
290
low = FirstOffsetNumber;
291
maxoff = high = PageGetMaxOffsetNumber(page);
298
OffsetNumber mid = low + ((high - low) / 2);
300
if (mid == maxoff && GinPageRightMost(page))
309
GinNullCategory category;
311
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
312
attnum = gintuple_get_attrnum(btree->ginstate, itup);
313
key = gintuple_get_key(btree->ginstate, itup, &category);
314
result = ginCompareAttEntries(btree->ginstate,
317
btree->entryCategory,
318
attnum, key, category);
324
Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
325
return GinGetDownlink(itup);
333
Assert(high >= FirstOffsetNumber && high <= maxoff);
336
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, high));
337
Assert(GinGetDownlink(itup) != GIN_ROOT_BLKNO);
338
return GinGetDownlink(itup);
342
* Searches correct position for value on leaf page.
343
* Page should be correctly chosen.
344
* Returns true if value found on page.
347
entryLocateLeafEntry(GinBtree btree, GinBtreeStack *stack)
349
Page page = BufferGetPage(stack->buffer);
353
Assert(GinPageIsLeaf(page));
354
Assert(!GinPageIsData(page));
358
stack->off = FirstOffsetNumber;
362
low = FirstOffsetNumber;
363
high = PageGetMaxOffsetNumber(page);
367
stack->off = FirstOffsetNumber;
375
OffsetNumber mid = low + ((high - low) / 2);
379
GinNullCategory category;
382
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, mid));
383
attnum = gintuple_get_attrnum(btree->ginstate, itup);
384
key = gintuple_get_key(btree->ginstate, itup, &category);
385
result = ginCompareAttEntries(btree->ginstate,
388
btree->entryCategory,
389
attnum, key, category);
406
entryFindChildPtr(GinBtree btree, Page page, BlockNumber blkno, OffsetNumber storedOff)
409
maxoff = PageGetMaxOffsetNumber(page);
412
Assert(!GinPageIsLeaf(page));
413
Assert(!GinPageIsData(page));
415
/* if page isn't changed, we returns storedOff */
416
if (storedOff >= FirstOffsetNumber && storedOff <= maxoff)
418
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, storedOff));
419
if (GinGetDownlink(itup) == blkno)
423
* we hope, that needed pointer goes to right. It's true if there
426
for (i = storedOff + 1; i <= maxoff; i++)
428
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
429
if (GinGetDownlink(itup) == blkno)
432
maxoff = storedOff - 1;
436
for (i = FirstOffsetNumber; i <= maxoff; i++)
438
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
439
if (GinGetDownlink(itup) == blkno)
443
return InvalidOffsetNumber;
447
entryGetLeftMostPage(GinBtree btree, Page page)
451
Assert(!GinPageIsLeaf(page));
452
Assert(!GinPageIsData(page));
453
Assert(PageGetMaxOffsetNumber(page) >= FirstOffsetNumber);
455
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
456
return GinGetDownlink(itup);
460
entryIsEnoughSpace(GinBtree btree, Buffer buf, OffsetNumber off,
461
GinBtreeEntryInsertData *insertData)
465
Page page = BufferGetPage(buf);
467
Assert(insertData->entry);
468
Assert(!GinPageIsData(page));
470
if (insertData->isDelete)
472
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
474
releasedsz = MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
477
addedsz = MAXALIGN(IndexTupleSize(insertData->entry)) + sizeof(ItemIdData);
479
if (PageGetFreeSpace(page) + releasedsz >= addedsz)
486
* Delete tuple on leaf page if tuples existed and we
487
* should update it, update old child blkno to new right page
488
* if child split occurred
491
entryPreparePage(GinBtree btree, Page page, OffsetNumber off,
492
GinBtreeEntryInsertData *insertData, BlockNumber updateblkno)
494
Assert(insertData->entry);
495
Assert(!GinPageIsData(page));
497
if (insertData->isDelete)
499
Assert(GinPageIsLeaf(page));
500
PageIndexTupleDelete(page, off);
503
if (!GinPageIsLeaf(page) && updateblkno != InvalidBlockNumber)
505
IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
507
GinSetDownlink(itup, updateblkno);
512
* Prepare to insert data on an entry page.
514
* If it will fit, return GPTP_INSERT after doing whatever setup is needed
515
* before we enter the insertion critical section. *ptp_workspace can be
516
* set to pass information along to the execPlaceToPage function.
518
* If it won't fit, perform a page split and return two temporary page
519
* images into *newlpage and *newrpage, with result GPTP_SPLIT.
521
* In neither case should the given page buffer be modified here.
523
* Note: on insertion to an internal node, in addition to inserting the given
524
* item, the downlink of the existing item at stack->off will be updated to
525
* point to updateblkno.
527
static GinPlaceToPageRC
528
entryBeginPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
529
void *insertPayload, BlockNumber updateblkno,
530
void **ptp_workspace,
531
Page *newlpage, Page *newrpage)
533
GinBtreeEntryInsertData *insertData = insertPayload;
534
OffsetNumber off = stack->off;
536
/* If it doesn't fit, deal with split case */
537
if (!entryIsEnoughSpace(btree, buf, off, insertData))
539
entrySplitPage(btree, buf, stack, insertData, updateblkno,
544
/* Else, we're ready to proceed with insertion */
549
* Perform data insertion after beginPlaceToPage has decided it will fit.
551
* This is invoked within a critical section, and XLOG record creation (if
552
* needed) is already started. The target buffer is registered in slot 0.
555
entryExecPlaceToPage(GinBtree btree, Buffer buf, GinBtreeStack *stack,
556
void *insertPayload, BlockNumber updateblkno,
559
GinBtreeEntryInsertData *insertData = insertPayload;
560
Page page = BufferGetPage(buf);
561
OffsetNumber off = stack->off;
564
entryPreparePage(btree, page, off, insertData, updateblkno);
566
placed = PageAddItem(page,
567
(Item) insertData->entry,
568
IndexTupleSize(insertData->entry),
571
elog(ERROR, "failed to add item to index page in \"%s\"",
572
RelationGetRelationName(btree->index));
574
if (RelationNeedsWAL(btree->index))
577
* This must be static, because it has to survive until XLogInsert,
578
* and we can't palloc here. Ugly, but the XLogInsert infrastructure
579
* isn't reentrant anyway.
581
static ginxlogInsertEntry data;
583
data.isDelete = insertData->isDelete;
586
XLogRegisterBufData(0, (char *) &data,
587
offsetof(ginxlogInsertEntry, tuple));
588
XLogRegisterBufData(0, (char *) insertData->entry,
589
IndexTupleSize(insertData->entry));
594
* Split entry page and insert new data.
596
* Returns new temp pages to *newlpage and *newrpage.
597
* The original buffer is left untouched.
600
entrySplitPage(GinBtree btree, Buffer origbuf,
601
GinBtreeStack *stack,
602
GinBtreeEntryInsertData *insertData,
603
BlockNumber updateblkno,
604
Page *newlpage, Page *newrpage)
606
OffsetNumber off = stack->off;
609
separator = InvalidOffsetNumber;
616
Page lpage = PageGetTempPageCopy(BufferGetPage(origbuf));
617
Page rpage = PageGetTempPageCopy(BufferGetPage(origbuf));
618
Size pageSize = PageGetPageSize(lpage);
619
char tupstore[2 * BLCKSZ];
621
entryPreparePage(btree, lpage, off, insertData, updateblkno);
624
* First, append all the existing tuples and the new tuple we're inserting
625
* one after another in a temporary workspace.
627
maxoff = PageGetMaxOffsetNumber(lpage);
629
for (i = FirstOffsetNumber; i <= maxoff; i++)
633
size = MAXALIGN(IndexTupleSize(insertData->entry));
634
memcpy(ptr, insertData->entry, size);
636
totalsize += size + sizeof(ItemIdData);
639
itup = (IndexTuple) PageGetItem(lpage, PageGetItemId(lpage, i));
640
size = MAXALIGN(IndexTupleSize(itup));
641
memcpy(ptr, itup, size);
643
totalsize += size + sizeof(ItemIdData);
646
if (off == maxoff + 1)
648
size = MAXALIGN(IndexTupleSize(insertData->entry));
649
memcpy(ptr, insertData->entry, size);
651
totalsize += size + sizeof(ItemIdData);
655
* Initialize the left and right pages, and copy all the tuples back to
658
GinInitPage(rpage, GinPageGetOpaque(lpage)->flags, pageSize);
659
GinInitPage(lpage, GinPageGetOpaque(rpage)->flags, pageSize);
666
for (i = FirstOffsetNumber; i <= maxoff; i++)
668
itup = (IndexTuple) ptr;
671
* Decide where to split. We try to equalize the pages' total data
672
* size, not number of tuples.
674
if (lsize > totalsize / 2)
676
if (separator == InvalidOffsetNumber)
682
lsize += MAXALIGN(IndexTupleSize(itup)) + sizeof(ItemIdData);
685
if (PageAddItem(page, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
686
elog(ERROR, "failed to add item to index page in \"%s\"",
687
RelationGetRelationName(btree->index));
688
ptr += MAXALIGN(IndexTupleSize(itup));
691
/* return temp pages to caller */
697
* Construct insertion payload for inserting the downlink for given buffer.
700
entryPrepareDownlink(GinBtree btree, Buffer lbuf)
702
GinBtreeEntryInsertData *insertData;
703
Page lpage = BufferGetPage(lbuf);
704
BlockNumber lblkno = BufferGetBlockNumber(lbuf);
707
itup = getRightMostTuple(lpage);
709
insertData = palloc(sizeof(GinBtreeEntryInsertData));
710
insertData->entry = GinFormInteriorTuple(itup, lpage, lblkno);
711
insertData->isDelete = false;
717
* Fills new root by rightest values from child.
718
* Also called from ginxlog, should not use btree
721
ginEntryFillRoot(GinBtree btree, Page root,
722
BlockNumber lblkno, Page lpage,
723
BlockNumber rblkno, Page rpage)
727
itup = GinFormInteriorTuple(getRightMostTuple(lpage), lpage, lblkno);
728
if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
729
elog(ERROR, "failed to add item to index root page");
732
itup = GinFormInteriorTuple(getRightMostTuple(rpage), rpage, rblkno);
733
if (PageAddItem(root, (Item) itup, IndexTupleSize(itup), InvalidOffsetNumber, false, false) == InvalidOffsetNumber)
734
elog(ERROR, "failed to add item to index root page");
739
* Set up GinBtree for entry page access
741
* Note: during WAL recovery, there may be no valid data in ginstate
742
* other than a faked-up Relation pointer; the key datum is bogus too.
745
ginPrepareEntryScan(GinBtree btree, OffsetNumber attnum,
746
Datum key, GinNullCategory category,
749
memset(btree, 0, sizeof(GinBtreeData));
751
btree->index = ginstate->index;
752
btree->rootBlkno = GIN_ROOT_BLKNO;
753
btree->ginstate = ginstate;
755
btree->findChildPage = entryLocateEntry;
756
btree->getLeftMostChild = entryGetLeftMostPage;
757
btree->isMoveRight = entryIsMoveRight;
758
btree->findItem = entryLocateLeafEntry;
759
btree->findChildPtr = entryFindChildPtr;
760
btree->beginPlaceToPage = entryBeginPlaceToPage;
761
btree->execPlaceToPage = entryExecPlaceToPage;
762
btree->fillRoot = ginEntryFillRoot;
763
btree->prepareDownlink = entryPrepareDownlink;
765
btree->isData = false;
766
btree->fullScan = false;
767
btree->isBuild = false;
769
btree->entryAttnum = attnum;
770
btree->entryKey = key;
771
btree->entryCategory = category;