/*-------------------------------------------------------------------------
 *
 * gistutil.c
 *    utility routines for the postgres GiST index access method.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/gist/gistutil.c
 *
 *-------------------------------------------------------------------------
 */
#include "utils/fmgrprotos.h"
/*
 * Write itup vector to page; performs no check of free space.
 */
            elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
                 i, len, (int) IndexTupleSize(thistup));
/*
 * Check space for itup vector on page
 */
    unsigned int size = freespace,
                deleted = 0;
    /* TODO: Consider fillfactor */
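    /*
     * For illustration, a minimal sketch of the space accounting used here,
     * under the simplifying assumption (hypothetical; it ignores the
     * fillfactor TODO and any to-be-deleted tuple) that each tuple costs
     * its own size plus one line pointer:
     *
     *      Size        size = 0;
     *
     *      for (i = 0; i < len; i++)
     *          size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
     *
     *      return PageGetFreeSpace(page) < size;
     */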
/*
 * Read buffer into itup vector
 */
/*
 * join two vectors into one
 */
    memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen);
/*
 * make plain IndexTuple vector
 */
    for (i = 0; i < veclen; i++)
        *memlen += IndexTupleSize(vec[i]);

    ptr = ret = palloc(*memlen);

    for (i = 0; i < veclen; i++)
    {
        memcpy(ptr, vec[i], IndexTupleSize(vec[i]));
        ptr += IndexTupleSize(vec[i]);
    }
/*
 * Make unions of keys in IndexTuple vector (one union datum per index column).
 * Union Datums are returned into the attr/isnull arrays.
 * Resulting Datums aren't compressed.
 */
                   Datum *attr, bool *isnull)
        int         attrsize = 0;   /* silence compiler warning */
        /* Collect non-null datums for this column */
        /* If this column was all NULLs, the union is NULL */
            /* unionFn may expect at least two inputs */
            /* Make union and store in attr array */
/*
 * Return an IndexTuple containing the result of applying the "union"
 * method to the specified IndexTuple vector.
 */
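/*
 * In outline, this is just the composition of the two routines above; a
 * plausible sketch (argument order as in the declarations used elsewhere
 * in this file) is:
 *
 *      gistMakeUnionItVec(giststate, itvec, len, attr, isnull);
 *      return gistFormTuple(giststate, r, attr, isnull, false);
 */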
/*
 * makes the union of two keys
 */
                 Datum *dst, bool *dstisnull)
    /* we need a GistEntryVector with room for exactly 2 elements */
    int         dstsize = 0;    /* silence compiler warning */
    if (isnull1 && isnull2)
        if (isnull1 == false && isnull2 == false)
        {
            evec->vector[0] = *entry1;
            evec->vector[1] = *entry2;
        }
        else if (isnull1 == false)
        {
            evec->vector[0] = *entry1;
            evec->vector[1] = *entry1;
        }
        else
        {
            evec->vector[0] = *entry2;
            evec->vector[1] = *entry2;
        }
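        /*
         * Note the trick above: with only one non-null input, that entry is
         * duplicated into both slots, so the union support function always
         * sees two entries.  The actual union call then looks roughly like
         * this (sketch; evec->n and the surrounding locals are assumed from
         * context):
         *
         *      evec->n = 2;
         *      *dst = FunctionCall2Coll(&giststate->unionFn[attno],
         *                               giststate->supportCollation[attno],
         *                               PointerGetDatum(evec),
         *                               PointerGetDatum(&dstsize));
         */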
    bool        result = false; /* silence compiler warning */
/*
 * Decompress all keys in tuple
 */
/*
 * Forms the union of oldtup and addtup; if the union equals oldtup, returns NULL.
 */
    bool        neednew = false;
                         oldentries + i, oldisnull[i],
                         addentries + i, addisnull[i],
                         attr + i, isnull + i);
            /* we already need new key, so we can skip check */
        /* the union of the keys may be NULL if and only if both keys are NULL */
        /* need to update key */
/*
 * Search an upper index page for the entry with lowest penalty for insertion
 * of the new index key contained in "it".
 *
 * Returns the index of the page entry to insert into.
 */
    int         keep_current_best;
    /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
    /*
     * The index may have multiple columns, and there's a penalty value for
     * each column.  The penalty associated with a column that appears earlier
     * in the index definition is strictly more important than the penalty of
     * a column that appears later in the index definition.
     *
     * best_penalty[j] is the best penalty we have seen so far for column j,
     * or -1 when we haven't yet examined column j.  Array entries to the
     * right of the first -1 are undefined.
     */
    best_penalty[0] = -1;
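    /*
     * As a worked example of the column ordering rule: with two columns,
     * per-column penalties (0.5, 9.0) beat (0.6, 0.0), because column 0
     * dominates; column 1 is only consulted to break ties when the column-0
     * penalties are exactly equal.
     */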
    /*
     * If we find a tuple that's exactly as good as the currently best one, we
     * could use either one.  When inserting a lot of tuples with the same or
     * similar keys, it's preferable to descend down the same path when
     * possible, as that's more cache-friendly.  On the other hand, if all
     * inserts land on the same leaf page after a split, we're never going to
     * insert anything to the other half of the split, and will end up using
     * only 50% of the available space.  Distributing the inserts evenly would
     * lead to better space usage, but that hurts cache-locality during
     * insertion.  To get the best of both worlds, when we find a tuple that's
     * exactly as good as the previous best, choose randomly whether to stick
     * to the old best, or use the new one.  Once we decide to stick to the
     * old best, we keep sticking to it for any subsequent equally good tuples
     * we might find.  This favors tuples with low offsets, but still allows
     * some inserts to go to other equally-good subtrees.
     *
     * keep_current_best is -1 if we haven't yet had to make a random choice
     * whether to keep the current best tuple.  If we have done so, and
     * decided to keep it, keep_current_best is 1; if we've decided to
     * replace, keep_current_best is 0.  (This state will be reset to -1 as
     * soon as we've made the replacement, but sometimes we make the choice in
     * advance of actually finding a replacement best tuple.)
     */
    keep_current_best = -1;
    /*
     * Loop over tuples on page.
     */
        /* Loop over index attributes. */
            /* Compute penalty for this column. */
                                &identry[j], isnull[j]);
                zero_penalty = false;
            if (best_penalty[j] < 0 || usize < best_penalty[j])
                /*
                 * New best penalty for column.  Tentatively select this tuple
                 * as the target, and record the best penalty.  Then reset the
                 * next column's penalty to "unknown" (and indirectly, the
                 * same for all the ones to its right).  This will force us to
                 * adopt this tuple's penalty values as the best for all the
                 * remaining columns during subsequent loop iterations.
                 */
                best_penalty[j] = usize;

                best_penalty[j + 1] = -1;
                /* we have new best, so reset keep-it decision */
                keep_current_best = -1;
            else if (best_penalty[j] == usize)
                /*
                 * The current tuple is exactly as good for this column as the
                 * best tuple seen so far.  The next iteration of this loop
                 * will compare the next column.
                 */
                /*
                 * The current tuple is worse for this column than the best
                 * tuple seen so far.  Skip the remaining columns and move on
                 * to the next tuple, if any.
                 */
                zero_penalty = false;   /* so outer loop won't exit */
        /*
         * If we looped past the last column, and did not update "result",
         * then this tuple is exactly as good as the prior best tuple.
         */
            if (keep_current_best == -1)
            {
                /* we didn't make the random choice yet for this old best */
                keep_current_best = pg_prng_bool(&pg_global_prng_state) ? 1 : 0;
            }
            if (keep_current_best == 0)
            {
                /* we choose to use the new tuple */
                result = i;

                /* choose again if there are even more exactly-as-good ones */
                keep_current_best = -1;
            }
        /*
         * If we find a tuple with zero penalty for all columns, and we've
         * decided we don't want to search for another tuple with equal
         * penalty, there's no need to examine remaining tuples; just break
         * out of the loop and return it.
         */
            if (keep_current_best == -1)
            {
                /* we didn't make the random choice yet for this old best */
                keep_current_best = pg_prng_bool(&pg_global_prng_state) ? 1 : 0;
            }
            if (keep_current_best == 1)
                break;
/*
 * initialize a GiST entry with a decompressed version of key
 */
        /* there may not be a decompress function in opclass */
        /* decompressFn may just return the given pointer */
              const Datum *attdata, const bool *isnull, bool isleaf)
    /*
     * The offset number on tuples on internal pages is unused.  For
     * historical reasons, it is set to 0xffff.
     */
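    /*
     * Concretely, that reset is a one-liner along these lines (sketch; the
     * tuple variable name is assumed):
     *
     *      ItemPointerSetOffsetNumber(&itup->t_tid, 0xffff);
     */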
                    const Datum *attdata, const bool *isnull, bool isleaf,
                    Datum *compatt)
    /*
     * Call the compress method on each attribute.
     */
            /* there may not be a compress function in opclass */
            compatt[i] = cep->key;
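            /*
             * For reference, the compress call that produces "cep" above has
             * roughly this shape (sketch; "centry" is an assumed local
             * GISTENTRY initialized from attdata[i]):
             *
             *      gistentryinit(centry, attdata[i], r, NULL,
             *                    (OffsetNumber) 0, isleaf);
             *      cep = (GISTENTRY *)
             *          DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[i],
             *                                            giststate->supportCollation[i],
             *                                            PointerGetDatum(&centry)));
             */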
    /*
     * Emplace each included attribute, if any.
     */
        compatt[i] = attdata[i];
/*
 * initialize a GiST entry with fetched value in key field
 */
    /* fetchFn set 'key', return it to the caller */
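    /*
     * The fetch itself is a single support-function call, roughly (sketch;
     * "fentry" is an assumed GISTENTRY wrapping the stored key):
     *
     *      fep = (GISTENTRY *)
     *          DatumGetPointer(FunctionCall1Coll(&giststate->fetchFn[nkey],
     *                                            giststate->supportCollation[nkey],
     *                                            PointerGetDatum(&fentry)));
     *      return fep->key;
     */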
/*
 * Fetch all keys in tuple.
 * Returns a new HeapTuple containing the originally-indexed data.
 */
            /*
             * If the opclass does not provide a compress method that could
             * change the original value, the attribute is necessarily stored
             * in its original form.
             */
            /*
             * Index-only scans are not supported for this column.  Since the
             * planner chose an index-only scan anyway, it is not interested
             * in this column, and we can replace it with a NULL.
             */
    /*
     * Get each included attribute.
     */
        (isNullOrig == false && isNullAdd == false))
        /* disallow negative or NaN penalty */
        if (isnan(penalty) || penalty < 0.0)
            penalty = 0.0;
    else if (isNullOrig && isNullAdd)
        penalty = 0.0;          /* keys are equal */
    else
        penalty = get_float4_infinity();    /* try to prevent mixing null and
                                             * non-null values */
/*
 * Initialize a new index page
 */
/*
 * Initialize a new index buffer
 */
/*
 * Verify that a freshly-read page looks sane.
 */
    /*
     * ReadBuffer verifies that every newly-read page passes
     * PageHeaderIsValid, which means it either contains a reasonably sane
     * page header or is all-zero.  We have to defend against the all-zero
     * case, however.
     */
    if (PageIsNew(page))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains unexpected zero page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
    /*
     * Additionally check that the special area looks sane.
     */
    if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData)))
        ereport(ERROR,
                (errcode(ERRCODE_INDEX_CORRUPTED),
                 errmsg("index \"%s\" contains corrupted page at block %u",
                        RelationGetRelationName(rel),
                        BufferGetBlockNumber(buf)),
                 errhint("Please REINDEX it.")));
/*
 * Allocate a new page (either by recycling, or by extending the index file)
 *
 * The returned buffer is already pinned and exclusive-locked
 *
 * Caller is responsible for initializing the page by calling GISTInitBuffer
 */
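/*
 * A typical caller pairs the allocation with initialization, along these
 * lines (sketch; F_LEAF applies when allocating a leaf page):
 *
 *      buffer = gistNewBuffer(r, heaprel);
 *      GISTInitBuffer(buffer, F_LEAF);
 *      page = BufferGetPage(buffer);
 */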
    /* First, try to get a page from FSM */
            break;              /* nothing left in FSM */
        /*
         * We have to guard against the possibility that someone else already
         * recycled this page; the buffer may be locked if so.
         */
            /*
             * If the page was never initialized, it's OK to use.
             */
                /*
                 * Otherwise, recycle it if deleted, and too old to have any
                 * processes interested in it.
                 */
                    /*
                     * If we are generating WAL for Hot Standby then create a
                     * WAL record that will allow us to conflict with queries
                     * running on standby, in case they have snapshots older
                     * than the page's deleteXid.
                     */
            /* Can't use it, so release buffer and try again */
    /* Must extend the file */
/* Can this page be recycled yet? */
    /*
     * The page was deleted, but when?  If it was just deleted, a scan
     * might have seen the downlink to it, and will read the page later.
     * As long as that can happen, we must keep the deleted page around as
     * a tombstone.
     *
     * For that, check if the deletion XID could still be visible to
     * anyone.  If not, then no scan that's still in progress could have
     * seen its downlink, and we can recycle it.
     */
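    /*
     * That visibility test reduces to one call, roughly (sketch; passing
     * NULL for the relation assumes the most conservative horizon):
     *
     *      return GlobalVisCheckRemovableFullXid(NULL,
     *                                            GistPageGetDeleteXid(page));
     */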
/*
 * gistproperty() -- Check boolean properties of indexes.
 *
 * This is optional for most AMs, but is required for GiST because the core
 * property code doesn't support AMPROP_DISTANCE_ORDERABLE.  We also handle
 * AMPROP_RETURNABLE here to save opening the rel to call gistcanreturn.
 */
             bool *res, bool *isnull)
    /* Only answer column-level inquiries */
    /*
     * Currently, GiST distance-ordered scans require that there be a distance
     * function in the opclass with the default types (i.e. the one loaded
     * into the relcache entry, see initGISTstate).  So we assume that if such
     * a function exists, then there's a reason for it (rather than grubbing
     * through all the opfamily's operators to find an ordered one).
     *
     * Essentially the same code can test whether we support returning the
     * column data, since that's true if the opclass provides a fetch proc.
     */
    /* First we need to know the column's opclass. */
    /* Now look up the opclass family and input datatype. */
    /* And now we can check whether the function is provided. */
    /*
     * Special case: even without a fetch function, AMPROP_RETURNABLE is true
     * if the opclass has no compress function.
     */
/*
 * Some indexes are not WAL-logged, but we need LSNs to detect concurrent page
 * splits anyway.  This function provides a fake sequence of LSNs for that
 * purpose.
 */
    if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP)
        /*
         * Temporary relations are only accessible in our session, so a simple
         * backend-local counter will do.
         */
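        /*
         * A minimal sketch of such a counter (FirstNormalUnloggedLSN as the
         * starting value keeps the fake LSNs valid but distinct from real
         * ones):
         *
         *      static XLogRecPtr counter = FirstNormalUnloggedLSN;
         *
         *      return counter++;
         */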
        /*
         * WAL-logging on this relation will start after commit, so its LSNs
         * must be distinct numbers smaller than the LSN at the next commit.
         * Emit a dummy WAL record if insert-LSN hasn't advanced after the
         * last call.
         */
        /* Shouldn't be called for WAL-logging relations */
        /* No need for an actual record if we already have a distinct LSN */
        /*
         * Unlogged relations are accessible from other backends, and survive
         * (clean) restarts.  GetFakeLSNForUnloggedRel() handles that for us.
         */
        Assert(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED);
        return GetFakeLSNForUnloggedRel();
/*
 * This is a stratnum translation support function for GiST opclasses that use
 * the RT*StrategyNumber constants.
 */
/*
 * Returns the opclass's private stratnum used for the given compare type.
 *
 * Calls the opclass's GIST_TRANSLATE_CMPTYPE_PROC support function, if any,
 * and returns the result.  Returns InvalidStrategy if the function is not
 * defined.
 */
    /* Check whether the function is provided. */
    /* Ask the translation function */
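    /*
     * Together, the two steps above look roughly like this (sketch; the
     * exact lookup types passed to get_opfamily_proc are an assumption):
     *
     *      funcid = get_opfamily_proc(opfamily, opcintype, opcintype,
     *                                 GIST_TRANSLATE_CMPTYPE_PROC);
     *      if (!OidIsValid(funcid))
     *          return InvalidStrategy;
     *      result = OidFunctionCall1Coll(funcid, InvalidOid,
     *                                    Int32GetDatum((int32) cmptype));
     *      return DatumGetUInt16(result);
     */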