1
/*-------------------------------------------------------------------------
4
* routines for dealing with posting lists.
7
* Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
8
* Portions Copyright (c) 1994, Regents of the University of California
11
* src/backend/access/gin/ginpostinglist.c
12
*-------------------------------------------------------------------------
17
#include "access/gin_private.h"
19
#ifdef USE_ASSERT_CHECKING
20
#define CHECK_ENCODING_ROUNDTRIP
24
* For encoding purposes, item pointers are represented as 64-bit unsigned
25
* integers. The lowest 11 bits represent the offset number, and the next
26
* lowest 32 bits are the block number. That leaves 17 bits unused, i.e.
27
* only 43 low bits are used.
29
* These 43-bit integers are encoded using varbyte encoding. In each byte,
30
* the 7 low bits contain data, while the highest bit is a continuation bit.
31
* When the continuation bit is set, the next byte is part of the same
32
* integer, otherwise this is the last byte of this integer. 43 bits fit
33
* conveniently in at most 6 bytes when varbyte encoded (the 6th byte does
34
* not need a continuation bit, because we know the max size to be 43 bits):
38
* 1XXXXXXX 1XXXXYYY 0YYYYYYY
39
* 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
40
* 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
41
* 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY YYYYYYYY
43
* X = bits used for offset number
44
* Y = bits used for block number
46
* The bytes are in stored in little-endian order.
48
* An important property of this encoding is that removing an item from list
49
* never increases the size of the resulting compressed posting list. Proof:
51
* Removing number is actually replacement of two numbers with their sum. We
52
* have to prove that varbyte encoding of a sum can't be longer than varbyte
53
* encoding of its summands. Sum of two numbers is at most one bit wider than
54
* the larger of the summands. Widening a number by one bit enlarges its length
55
* in varbyte encoding by at most one byte. Therefore, varbyte encoding of sum
56
* is at most one byte longer than varbyte encoding of larger summand. Lesser
57
* summand is at least one byte, so the sum cannot take more space than the
60
* This property greatly simplifies VACUUM, which can assume that posting
61
* lists always fit on the same page after vacuuming. Note that even though
62
* that holds for removing items from a posting list, you must also be
63
* careful to not cause expansion e.g. when merging uncompressed items on the
64
* page into the compressed lists, when vacuuming.
68
* How many bits do you need to encode offset number? OffsetNumber is a 16-bit
69
* integer, but you can't fit that many items on a page. 11 ought to be more
70
* than enough. It's tempting to derive this from MaxHeapTuplesPerPage, and
71
* use the minimum number of bits, but that would require changing the on-disk
72
* format if MaxHeapTuplesPerPage changes. Better to leave some slack.
74
#define MaxHeapTuplesPerPageBits 11
77
itemptr_to_uint64(const ItemPointer iptr)
81
Assert(ItemPointerIsValid(iptr));
82
Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));
84
val = GinItemPointerGetBlockNumber(iptr);
85
val <<= MaxHeapTuplesPerPageBits;
86
val |= GinItemPointerGetOffsetNumber(iptr);
92
uint64_to_itemptr(uint64 val, ItemPointer iptr)
94
GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
95
val = val >> MaxHeapTuplesPerPageBits;
96
GinItemPointerSetBlockNumber(iptr, val);
98
Assert(ItemPointerIsValid(iptr));
102
* Varbyte-encode 'val' into *ptr. *ptr is incremented to next integer.
105
encode_varbyte(uint64 val, unsigned char **ptr)
107
unsigned char *p = *ptr;
111
*(p++) = 0x80 | (val & 0x7F);
114
*(p++) = (unsigned char) val;
120
* Decode varbyte-encoded integer at *ptr. *ptr is incremented to next integer.
123
decode_varbyte(unsigned char **ptr)
126
unsigned char *p = *ptr;
134
val |= (c & 0x7F) << 7;
138
val |= (c & 0x7F) << 14;
142
val |= (c & 0x7F) << 21;
146
val |= (c & 0x7F) << 28;
150
val |= (c & 0x7F) << 35;
153
/* last byte, no continuation bit */
169
* Encode a posting list.
171
* The encoded list is returned in a palloc'd struct, which will be at most
172
* 'maxsize' bytes in size. The number items in the returned segment is
173
* returned in *nwritten. If it's not equal to nipd, not all the items fit
174
* in 'maxsize', and only the first *nwritten were encoded.
176
* The allocated size of the returned struct is short-aligned, and the padding
177
* byte at the end, if any, is zero.
180
ginCompressPostingList(const ItemPointer ipd, int nipd, int maxsize,
186
GinPostingList *result;
188
unsigned char *endptr;
190
maxsize = SHORTALIGN_DOWN(maxsize);
192
result = palloc(maxsize);
194
maxbytes = maxsize - offsetof(GinPostingList, bytes);
195
Assert(maxbytes > 0);
197
/* Store the first special item */
198
result->first = ipd[0];
200
prev = itemptr_to_uint64(&result->first);
203
endptr = result->bytes + maxbytes;
204
for (totalpacked = 1; totalpacked < nipd; totalpacked++)
206
uint64 val = itemptr_to_uint64(&ipd[totalpacked]);
207
uint64 delta = val - prev;
211
if (endptr - ptr >= 6)
212
encode_varbyte(delta, &ptr);
216
* There are less than 6 bytes left. Have to check if the next
217
* item fits in that space before writing it out.
219
unsigned char buf[6];
220
unsigned char *p = buf;
222
encode_varbyte(delta, &p);
223
if (p - buf > (endptr - ptr))
224
break; /* output is full */
226
memcpy(ptr, buf, p - buf);
231
result->nbytes = ptr - result->bytes;
234
* If we wrote an odd number of bytes, zero out the padding byte at the
237
if (result->nbytes != SHORTALIGN(result->nbytes))
238
result->bytes[result->nbytes] = 0;
241
*nwritten = totalpacked;
243
Assert(SizeOfGinPostingList(result) <= maxsize);
246
* Check that the encoded segment decodes back to the original items.
248
#if defined (CHECK_ENCODING_ROUNDTRIP)
251
ItemPointer tmp = ginPostingListDecode(result, &ndecoded);
254
Assert(ndecoded == totalpacked);
255
for (i = 0; i < ndecoded; i++)
256
Assert(memcmp(&tmp[i], &ipd[i], sizeof(ItemPointerData)) == 0);
265
* Decode a compressed posting list into an array of item pointers.
266
* The number of items is returned in *ndecoded.
269
ginPostingListDecode(GinPostingList *plist, int *ndecoded)
271
return ginPostingListDecodeAllSegments(plist,
272
SizeOfGinPostingList(plist),
277
* Decode multiple posting list segments into an array of item pointers.
278
* The number of items is returned in *ndecoded_out. The segments are stored
279
* one after each other, with total size 'len' bytes.
282
ginPostingListDecodeAllSegments(GinPostingList *segment, int len, int *ndecoded_out)
287
char *endseg = ((char *) segment) + len;
290
unsigned char *endptr;
293
* Guess an initial size of the array.
295
nallocated = segment->nbytes * 2 + 1;
296
result = palloc(nallocated * sizeof(ItemPointerData));
299
while ((char *) segment < endseg)
301
/* enlarge output array if needed */
302
if (ndecoded >= nallocated)
305
result = repalloc(result, nallocated * sizeof(ItemPointerData));
308
/* copy the first item */
309
Assert(OffsetNumberIsValid(ItemPointerGetOffsetNumber(&segment->first)));
310
Assert(ndecoded == 0 || ginCompareItemPointers(&segment->first, &result[ndecoded - 1]) > 0);
311
result[ndecoded] = segment->first;
314
val = itemptr_to_uint64(&segment->first);
315
ptr = segment->bytes;
316
endptr = segment->bytes + segment->nbytes;
319
/* enlarge output array if needed */
320
if (ndecoded >= nallocated)
323
result = repalloc(result, nallocated * sizeof(ItemPointerData));
326
val += decode_varbyte(&ptr);
328
uint64_to_itemptr(val, &result[ndecoded]);
331
segment = GinNextPostingListSegment(segment);
335
*ndecoded_out = ndecoded;
340
* Add all item pointers from a bunch of posting lists to a TIDBitmap.
343
ginPostingListDecodeAllSegmentsToTbm(GinPostingList *ptr, int len,
349
items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
350
tbm_add_tuples(tbm, items, ndecoded, false);
357
* Merge two ordered arrays of itempointers, eliminating any duplicates.
359
* Returns a palloc'd array, and *nmerged is set to the number of items in
360
* the result, after eliminating duplicates.
363
ginMergeItemPointers(ItemPointerData *a, uint32 na,
364
ItemPointerData *b, uint32 nb,
367
ItemPointerData *dst;
369
dst = (ItemPointer) palloc((na + nb) * sizeof(ItemPointerData));
372
* If the argument arrays don't overlap, we can just append them to each
375
if (na == 0 || nb == 0 || ginCompareItemPointers(&a[na - 1], &b[0]) < 0)
377
memcpy(dst, a, na * sizeof(ItemPointerData));
378
memcpy(&dst[na], b, nb * sizeof(ItemPointerData));
381
else if (ginCompareItemPointers(&b[nb - 1], &a[0]) < 0)
383
memcpy(dst, b, nb * sizeof(ItemPointerData));
384
memcpy(&dst[nb], a, na * sizeof(ItemPointerData));
389
ItemPointerData *dptr = dst;
390
ItemPointerData *aptr = a;
391
ItemPointerData *bptr = b;
393
while (aptr - a < na && bptr - b < nb)
395
int cmp = ginCompareItemPointers(aptr, bptr);
401
/* only keep one copy of the identical items */
409
while (aptr - a < na)
412
while (bptr - b < nb)
415
*nmerged = dptr - dst;