/*-------------------------------------------------------------------------
 *
 * ginpostinglist.c
 *	  routines for dealing with posting lists.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gin/ginpostinglist.c
 *
 *-------------------------------------------------------------------------
 */
#ifdef USE_ASSERT_CHECKING
#define CHECK_ENCODING_ROUNDTRIP
#endif
/*
 * For encoding purposes, item pointers are represented as 64-bit unsigned
 * integers. The lowest 11 bits represent the offset number, and the next
 * lowest 32 bits are the block number. That leaves 21 bits unused, i.e.
 * only 43 low bits are used. (A sketch of the conversion helpers follows
 * the defines below.)
 *
 * 11 bits is enough for the offset number, because MaxHeapTuplesPerPage <
 * 2^11 on all supported block sizes. We are frugal with the bits, because
 * smaller integers use fewer bytes in the varbyte encoding, saving disk
 * space. (If we get a new table AM in the future that wants to use the full
 * range of possible offset numbers, we'll need to change this.)
 *
 * These 43-bit integers are encoded using varbyte encoding. In each byte,
 * the 7 low bits contain data, while the highest bit is a continuation bit.
 * When the continuation bit is set, the next byte is part of the same
 * integer, otherwise this is the last byte of this integer. 43 bits need
 * at most 7 bytes in this encoding:
 *
 * 0XXXXXXX
 * 1XXXXXXX 0XXXXYYY
 * 1XXXXXXX 1XXXXYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0YYYYYYY
 * 1XXXXXXX 1XXXXYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 1YYYYYYY 0uuuuuuY
 *
 * X = bits used for offset number
 * Y = bits used for block number
 * u = unused bit
 *
 * The bytes are stored in little-endian order. (A small worked example
 * follows this comment block.)
 *
 * An important property of this encoding is that removing an item from the
 * list never increases the size of the resulting compressed posting list.
 * Proof:
 *
 * Removing an item actually replaces two numbers (deltas) with their sum. We
 * have to prove that the varbyte encoding of a sum can't be longer than the
 * varbyte encoding of its summands. The sum of two numbers is at most one bit
 * wider than the larger of the summands. Widening a number by one bit enlarges
 * its length in varbyte encoding by at most one byte. Therefore, the varbyte
 * encoding of the sum is at most one byte longer than the varbyte encoding of
 * the larger summand. The lesser summand takes at least one byte, so the sum
 * cannot take more space than the summands, Q.E.D.
 *
 * This property greatly simplifies VACUUM, which can assume that posting
 * lists always fit on the same page after vacuuming. Note that even though
 * that holds for removing items from a posting list, you must also be
 * careful not to cause expansion, e.g. when merging uncompressed items on
 * the page into the compressed lists during vacuuming.
 */
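/*
 * Illustrative worked example (editorial addition, not part of the original
 * sources): the delta 300 is binary 1 0010 1100, which does not fit in 7
 * bits, so it is encoded as two bytes in little-endian order: 0xAC
 * (continuation bit set, low 7 bits 0x2C) followed by 0x02 (continuation bit
 * clear, remaining bits). For the removal proof above: if adjacent deltas
 * 200 (two bytes: 0xC8 0x01) and 100 (one byte: 0x64) are merged by removing
 * the item between them, their sum 300 needs only two bytes, so the
 * compressed list never grows.
 */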
/*
 * How many bits do you need to encode an offset number? OffsetNumber is a
 * 16-bit integer, but you can't fit that many items on a page. 11 ought to
 * be more than enough. It's tempting to derive this from
 * MaxHeapTuplesPerPage, and use the minimum number of bits, but that would
 * require changing the on-disk format if MaxHeapTuplesPerPage changes.
 * Better to leave some slack.
 */
#define MaxHeapTuplesPerPageBits		11
/* Max. number of bytes needed to encode the largest supported integer. */
#define MaxBytesPerInteger				7
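/*
 * Sketch of the conversion between ItemPointers and the 43-bit integer
 * layout described in the header comment: block number in the high bits,
 * offset number in the low MaxHeapTuplesPerPageBits bits. The Gin*
 * item-pointer accessors come from access/ginblock.h; treat this as an
 * illustration of the layout rather than a verbatim copy of the file's
 * helpers.
 */
static uint64
itemptr_to_uint64(const ItemPointer iptr)
{
    uint64      val;

    Assert(ItemPointerIsValid(iptr));
    Assert(GinItemPointerGetOffsetNumber(iptr) < (1 << MaxHeapTuplesPerPageBits));

    val = GinItemPointerGetBlockNumber(iptr);
    val <<= MaxHeapTuplesPerPageBits;
    val |= GinItemPointerGetOffsetNumber(iptr);

    return val;
}

static void
uint64_to_itemptr(uint64 val, ItemPointer iptr)
{
    GinItemPointerSetOffsetNumber(iptr, val & ((1 << MaxHeapTuplesPerPageBits) - 1));
    val = val >> MaxHeapTuplesPerPageBits;
    GinItemPointerSetBlockNumber(iptr, val);

    Assert(ItemPointerIsValid(iptr));
}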
/* Varbyte-encode 'val' into *ptr. *ptr is incremented to the next integer. */
static void
encode_varbyte(uint64 val, unsigned char **ptr)
{
    unsigned char *p = *ptr;

    while (val > 0x7F)
    {
        *(p++) = 0x80 | (val & 0x7F);
        val >>= 7;
    }
    *(p++) = (unsigned char) val;
    *ptr = p;
}
/*
 * Decode varbyte-encoded integer at *ptr. *ptr is incremented to the next
 * integer.
 */
    unsigned char *p = *ptr;

    val |= (c & 0x7F) << 7;     /* 2nd byte */
    val |= (c & 0x7F) << 14;    /* 3rd byte */
    val |= (c & 0x7F) << 21;    /* 4th byte */
    val |= (c & 0x7F) << 28;    /* 5th byte */
    val |= (c & 0x7F) << 35;    /* 6th byte */
    /* 7th byte, should not have continuation bit */
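/*
 * The decoder unrolls the byte-by-byte steps excerpted above, one per
 * possible byte. The following is a compact, behaviorally equivalent sketch
 * (an illustration under an assumed name, not the verbatim implementation):
 * each test checks the continuation bit of the byte read most recently, so
 * decoding stops at the first byte whose high bit is clear.
 */
static uint64
decode_varbyte_sketch(unsigned char **ptr)
{
    unsigned char *p = *ptr;
    uint64      val;
    uint64      c;

    c = *(p++);
    val = c & 0x7F;                                         /* 1st byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 7; }   /* 2nd byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 14; }  /* 3rd byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 21; }  /* 4th byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 28; }  /* 5th byte */
    if (c & 0x80) { c = *(p++); val |= (c & 0x7F) << 35; }  /* 6th byte */
    if (c & 0x80)
    {
        /* 7th byte, should not have a continuation bit */
        c = *(p++);
        val |= c << 42;
        Assert((c & 0x80) == 0);
    }

    *ptr = p;
    return val;
}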
/*
 * Encode a posting list.
 *
 * The encoded list is returned in a palloc'd struct, which will be at most
 * 'maxsize' bytes in size. The number of items in the returned segment is
 * returned in *nwritten. If it's not equal to nipd, not all the items fit
 * in 'maxsize', and only the first *nwritten were encoded.
 *
 * The allocated size of the returned struct is short-aligned, and the padding
 * byte at the end, if any, is zero.
 */
    unsigned char *endptr;

    /* Store the first special item */
    result->first = ipd[0];

    endptr = result->bytes + maxbytes;
    for (totalpacked = 1; totalpacked < nipd; totalpacked++)

            /*
             * There are fewer than 7 bytes left. Have to check if the next
             * item fits in that space before writing it out.
             */
            unsigned char *p = buf;

            if (p - buf > (endptr - ptr))
                break;          /* output is full */
            memcpy(ptr, buf, p - buf);

    /*
     * If we wrote an odd number of bytes, zero out the padding byte at the
     * end.
     */
    *nwritten = totalpacked;
    /*
     * Check that the encoded segment decodes back to the original items.
     */
#if defined (CHECK_ENCODING_ROUNDTRIP)
        Assert(ndecoded == totalpacked);
#endif
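/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): compress a sorted array of item pointers into fixed-size
 * segments, looping until everything has been written. The 256-byte segment
 * size is an arbitrary example; it is assumed to be large enough that at
 * least one item fits per call, as the real GIN callers guarantee.
 */
static void
compress_all_items_example(ItemPointerData *items, int nitems)
{
    ItemPointerData *next = items;
    int         remaining = nitems;

    while (remaining > 0)
    {
        int         nwritten;
        GinPostingList *seg;

        /* encode as many of the remaining items as fit in 256 bytes */
        seg = ginCompressPostingList(next, remaining, 256, &nwritten);

        /* ... store SizeOfGinPostingList(seg) bytes of 'seg' somewhere ... */
        pfree(seg);

        next += nwritten;
        remaining -= nwritten;
    }
}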
/*
 * Decode a compressed posting list into an array of item pointers.
 * The number of items is returned in *ndecoded.
 */
/*
 * Decode multiple posting list segments into an array of item pointers.
 * The number of items is returned in *ndecoded_out. The segments are stored
 * one after another, with total size 'len' bytes.
 */
    char       *endseg = ((char *) segment) + len;
    unsigned char *endptr;

    /*
     * Guess an initial size of the array.
     */
    nallocated = segment->nbytes * 2 + 1;

    while ((char *) segment < endseg)

        /* enlarge output array if needed */
        if (ndecoded >= nallocated)

        /* copy the first item */
        result[ndecoded] = segment->first;

        ptr = segment->bytes;

            /* enlarge output array if needed */
            if (ndecoded >= nallocated)

    *ndecoded_out = ndecoded;
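/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): decode a single segment back into an ItemPointerData array
 * and walk the items. The result array is palloc'd by the decode function
 * and must be freed by the caller.
 */
static void
decode_segment_example(GinPostingList *seg)
{
    int         ndecoded;
    ItemPointer items = ginPostingListDecode(seg, &ndecoded);

    for (int i = 0; i < ndecoded; i++)
    {
        BlockNumber blkno = ItemPointerGetBlockNumber(&items[i]);
        OffsetNumber offnum = ItemPointerGetOffsetNumber(&items[i]);

        /* ... use (blkno, offnum) ... */
        (void) blkno;
        (void) offnum;
    }
    pfree(items);
}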
/*
 * Add all item pointers from a bunch of posting lists to a TIDBitmap.
 */
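/*
 * Sketch of the likely shape of that routine (an illustration under an
 * assumed name, not the verbatim implementation): decode every segment, hand
 * the item pointers to the TIDBitmap, and free the temporary array.
 * tbm_add_tuples() is the regular TIDBitmap API from nodes/tidbitmap.h.
 */
static int
decode_segments_to_tbm_sketch(GinPostingList *ptr, int len, TIDBitmap *tbm)
{
    int         ndecoded;
    ItemPointer items;

    items = ginPostingListDecodeAllSegments(ptr, len, &ndecoded);
    tbm_add_tuples(tbm, items, ndecoded, false);
    pfree(items);

    return ndecoded;
}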
/*
 * Merge two ordered arrays of itempointers, eliminating any duplicates.
 *
 * Returns a palloc'd array, and *nmerged is set to the number of items in
 * the result, after eliminating duplicates.
 */
    /*
     * If the argument arrays don't overlap, we can just append them to each
     * other.
     */

    while (aptr - a < na && bptr - b < nb)

            /* only keep one copy of the identical items */

    while (aptr - a < na)

    while (bptr - b < nb)

    *nmerged = dptr - dst;
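/*
 * Hypothetical caller sketch (an editorial illustration, not from the
 * original file): merge two sorted TID arrays and consume the result. Both
 * inputs must already be in ascending item pointer order; the returned array
 * is palloc'd and the caller is responsible for freeing it.
 */
static void
merge_example(ItemPointerData *a, uint32 na, ItemPointerData *b, uint32 nb)
{
    int         nmerged;
    ItemPointer merged = ginMergeItemPointers(a, na, b, nb, &nmerged);

    /* duplicates are eliminated, so the result can only be smaller */
    Assert(nmerged <= (int) (na + nb));

    /* ... use 'merged' ... */
    pfree(merged);
}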