1/*-------------------------------------------------------------------------
4 * implementation of radix tree (compressed trie) over text
6 * In a text_ops SPGiST index, inner tuples can have a prefix which is the
7 * common prefix of all strings indexed under that tuple. The node labels
8 * represent the next byte of the string(s) after the prefix. Assuming we
9 * always use the longest possible prefix, we will get more than one node
10 * label unless the prefix length is restricted by SPGIST_MAX_PREFIX_LENGTH.
12 * To reconstruct the indexed string for any index entry, concatenate the
13 * inner-tuple prefixes and node labels starting at the root and working
14 * down to the leaf entry, then append the datum in the leaf entry.
15 * (While descending the tree, "level" is the number of bytes reconstructed
16 * so far.)
18 * However, there are two special cases for node labels: -1 indicates that
19 * there are no more bytes after the prefix-so-far, and -2 indicates that we
20 * had to split an existing allTheSame tuple (in such a case we have to create
21 * a node label that doesn't correspond to any string byte). In either case,
22 * the node label does not contribute anything to the reconstructed string.
24 * Previously, we used a node label of zero for both special cases, but
25 * this was problematic because one can't tell whether a string ending at
26 * the current level can be pushed down into such a child node. For
27 * backwards compatibility, we still support such node labels for reading;
28 * but no new entries will ever be pushed down into a zero-labeled child.
29 * No new entries ever get pushed into a -2-labeled child, either.
32 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
33 * Portions Copyright (c) 1994, Regents of the University of California
36 * src/backend/access/spgist/spgtextproc.c
38 *-------------------------------------------------------------------------
47#include "utils/fmgrprotos.h"
54 * In the worst case, an inner tuple in a text radix tree could have as many
55 * as 258 nodes (one for each possible byte value, plus the two special
56 * cases). Each node can take 16 bytes on MAXALIGN=8 machines. The inner
57 * tuple must fit on an index page of size BLCKSZ. Rather than assuming we
58 * know the exact amount of overhead imposed by page headers, tuple headers,
59 * etc, we leave 100 bytes for that (the actual overhead should be no more
60 * than 56 bytes at this writing, so there is slop in this number).
61 * So we can safely create prefixes up to BLCKSZ - 258 * 16 - 100 bytes long.
62 * Unfortunately, because 258 * 16 is over 4K, there is no safe prefix length
63 * when BLCKSZ is less than 8K; it is always possible to get "SPGiST inner
64 * tuple size exceeds maximum" if there are too many distinct next-byte values
65 * at a given place in the tree. Since use of nonstandard block sizes appears
66 * to be negligible in the field, we just live with that fact for now,
67 * choosing a max prefix size of 32 bytes when BLCKSZ is configured smaller
68 * than the default 8K.
70 #define SPGIST_MAX_PREFIX_LENGTH Max((int) (BLCKSZ - 258 * 16 - 100), 32)
73 * Strategy for collation aware operator on text is equal to btree strategy
74 * plus value of 10.
76 * Current collation aware strategies and their corresponding btree strategies:
77 * 11 BTLessStrategyNumber
78 * 12 BTLessEqualStrategyNumber
79 * 14 BTGreaterEqualStrategyNumber
80 * 15 BTGreaterStrategyNumber
82 #define SPG_STRATEGY_ADDITION (10)
83 #define SPG_IS_COLLATION_AWARE_STRATEGY(s) ((s) > SPG_STRATEGY_ADDITION \
84 && (s) != RTPrefixStrategyNumber)
86/* Struct for sorting values in picksplit */
98 /* spgConfigIn *cfgin = (spgConfigIn *) PG_GETARG_POINTER(0); */
104 cfg->
longValuesOK =
true;
/* suffixing will shorten long values */
109 * Form a text datum from the given not-necessarily-null-terminated string,
110 * using short varlena header format if possible
135 * Find the length of the common prefix of a and b
142 while (
i < lena &&
i < lenb && *
a == *
b)
153 * Binary search an array of int16 datums for a match to c
155 * On success, *i gets the match location; on failure, it gets where to insert
163 while (StopLow < StopHigh)
165 int StopMiddle = (StopLow + StopHigh) >> 1;
169 StopHigh = StopMiddle;
171 StopLow = StopMiddle + 1;
191 char *prefixStr = NULL;
197 /* Check for prefix match, set nodeChar to first byte after prefix */
210 if (commonLen == prefixSize)
212 if (inSize - in->
level > commonLen)
213 nodeChar = *(
unsigned char *) (inStr + in->
level + commonLen);
219 /* Must split tuple because incoming value doesn't match prefix */
240 if (prefixSize - commonLen == 1)
249 prefixSize - commonLen - 1);
255 else if (inSize > in->
level)
257 nodeChar = *(
unsigned char *) (inStr + in->
level);
264 /* Look up nodeChar in the node label array */
268 * Descend to existing node. (If in->allTheSame, the core code will
269 * ignore our nodeN specification here, but that's OK. We still have
270 * to provide the correct levelAdd and restDatum values, and those are
271 * the same regardless of which node gets chosen by core.)
277 levelAdd = commonLen;
281 if (inSize - in->
level - levelAdd > 0)
284 inSize - in->
level - levelAdd);
292 * Can't use AddNode action, so split the tuple. The upper tuple has
293 * the same prefix as before and uses a dummy node label -2 for the
294 * lower tuple. The lower tuple has no prefix and the same node
295 * labels as the original tuple.
297 * Note: it might seem tempting to shorten the upper tuple's prefix,
298 * if it has one, then use its last byte as label for the lower tuple.
299 * But that doesn't win since we know the incoming value matches the
300 * whole prefix: we'd just end up splitting the lower tuple again.
313 /* Add a node for the not-previously-seen nodeChar value */
322/* qsort comparator to sort spgNodePtr structs by "c" */
342 /* Identify longest common prefix, if any */
344 for (
i = 1;
i < in->
nTuples && commonLen > 0;
i++)
357 * Limit the prefix length, if necessary, to ensure that the resulting
358 * inner tuple will fit on a page.
362 /* Set node prefix to be that string, if it's not empty */
373 /* Extract the node label (first non-common byte) from each value */
381 nodes[
i].
c = *(
unsigned char *) (
VARDATA_ANY(texti) + commonLen);
383 nodes[
i].
c = -1;
/* use -1 if string is all common */
389 * Sort by label values so that we can group the values into nodes. This
390 * also ensures that the nodes are ordered by label value, allowing the
391 * use of binary search in searchChar.
395 /* And emit results */
406 if (
i == 0 || nodes[
i].
c != nodes[
i - 1].
c)
431 text *reconstructedValue;
434 text *prefixText = NULL;
439 * Reconstruct values represented at this tuple, including parent data,
440 * prefix of this tuple if any, and the node label if it's non-dummy.
441 * in->level should be the length of the previously reconstructed value,
442 * and the number of bytes added here is prefixSize or prefixSize + 1.
444 * Note: we assume that in->reconstructedValue isn't toasted and doesn't
445 * have a short varlena header. This is okay because it must have been
446 * created by a previous invocation of this routine, and we always emit
447 * long-format reconstructed values.
450 Assert(reconstructedValue == NULL ? in->
level == 0 :
453 maxReconstrLen = in->
level + 1;
458 maxReconstrLen += prefixSize;
472 /* last byte of reconstrText will be filled in below */
475 * Scan the child nodes. For each one, complete the reconstructed value
476 * and see if it's consistent with the query. If so, emit an entry into
477 * the output arrays.
491 /* If nodeChar is a dummy value, don't include it in data */
493 thisLen = maxReconstrLen - 1;
496 ((
unsigned char *)
VARDATA(reconstrText))[maxReconstrLen - 1] = nodeChar;
497 thisLen = maxReconstrLen;
508 * If it's a collation-aware operator, but the collation is C, we
509 * can treat it as non-collation-aware. With non-C collation we
510 * need to traverse whole tree :-( so there's no point in making
511 * any check here. (Note also that our reconstructed value may
512 * well end with a partial multibyte character, so that applying
513 * any encoding-sensitive test to it would be risky anyhow.)
527 Min(inSize, thisLen));
537 if (r != 0 || inSize < thisLen)
550 elog(
ERROR,
"unrecognized strategy number: %d",
556 break;
/* no need to consider remaining conditions */
578 int level = in->
level;
580 *reconstrValue = NULL;
586 /* all tests are exact */
591 /* As above, in->reconstructedValue isn't toasted or short. */
595 Assert(reconstrValue == NULL ? level == 0 :
598 /* Reconstruct the full string represented by this leaf tuple */
602 fullValue =
VARDATA(reconstrValue);
612 memcpy(fullValue,
VARDATA(reconstrValue), level);
619 /* Perform the required comparison(s) */
631 * if level >= length of query then reconstrValue must begin with
632 * query (prefix) string, so we don't need to check it again.
634 res = (level >= queryLen) ||
640 if (!res)
/* no need to consider remaining conditions */
648 /* Collation-aware comparison */
651 /* If asserts enabled, verify encoding of reconstructed string */
660 /* Non-collation-aware comparison */
661 r = memcmp(fullValue,
VARDATA_ANY(query),
Min(queryLen, fullLen));
665 if (queryLen > fullLen)
667 else if (queryLen < fullLen)
690 elog(
ERROR,
"unrecognized strategy number: %d",
697 break;
/* no need to consider remaining conditions */
Datum datumCopy(Datum value, bool typByVal, int typLen)
Datum DirectFunctionCall2Coll(PGFunction func, Oid collation, Datum arg1, Datum arg2)
#define DatumGetTextPP(X)
#define PG_GETARG_POINTER(n)
#define PG_GET_COLLATION()
#define PG_RETURN_BOOL(x)
Assert(PointerIsAligned(start, uint64))
static int pg_cmp_s16(int16 a, int16 b)
bool pg_verifymbstr(const char *mbstr, int len, bool noError)
pg_locale_t pg_newlocale_from_collation(Oid collid)
#define qsort(a, b, c, d)
static bool DatumGetBool(Datum X)
static Datum PointerGetDatum(const void *X)
static Datum Int16GetDatum(int16 X)
static Pointer DatumGetPointer(Datum X)
static int16 DatumGetInt16(Datum X)
Datum spg_text_config(PG_FUNCTION_ARGS)
static int commonPrefix(const char *a, const char *b, int lena, int lenb)
#define SPG_IS_COLLATION_AWARE_STRATEGY(s)
static int cmpNodePtr(const void *a, const void *b)
#define SPGIST_MAX_PREFIX_LENGTH
struct spgNodePtr spgNodePtr
#define SPG_STRATEGY_ADDITION
Datum spg_text_leaf_consistent(PG_FUNCTION_ARGS)
Datum spg_text_inner_consistent(PG_FUNCTION_ARGS)
static Datum formTextDatum(const char *data, int datalen)
Datum spg_text_choose(PG_FUNCTION_ARGS)
static bool searchChar(Datum *nodeLabels, int nNodes, int16 c, int *i)
Datum spg_text_picksplit(PG_FUNCTION_ARGS)
#define RTPrefixStrategyNumber
#define BTGreaterStrategyNumber
#define BTLessStrategyNumber
#define BTEqualStrategyNumber
#define BTLessEqualStrategyNumber
#define BTGreaterEqualStrategyNumber
StrategyNumber sk_strategy
spgChooseResultType resultType
struct spgChooseOut::@53::@54 matchNode
struct spgChooseOut::@53::@56 splitTuple
union spgChooseOut::@53 result
struct spgChooseOut::@53::@55 addNode
Datum * reconstructedValues
static Size VARSIZE_ANY_EXHDR(const void *PTR)
static char * VARDATA(const void *PTR)
static char * VARDATA_ANY(const void *PTR)
static void SET_VARSIZE_SHORT(void *PTR, Size len)
static void SET_VARSIZE(void *PTR, Size len)
int varstr_cmp(const char *arg1, int len1, const char *arg2, int len2, Oid collid)
Datum text_starts_with(PG_FUNCTION_ARGS)