git.postgresql.org Git - postgresql.git/commitdiff

git projects / postgresql.git / commitdiff

Separate out bytea sort support from varlena.c

author John Naylor <john.naylor@postgresql.org>

2025年12月16日 08:19:16 +0000 (15:19 +0700)

committer John Naylor <john.naylor@postgresql.org>

2025年12月16日 08:19:16 +0000 (15:19 +0700)

In the wake of commit b45242fd3, bytea_sortsupport() still called out
to varstr_sortsupport(). Treating bytea as a kind of text/varchar
required varstr_sortsupport() to allow for the possibility of
NUL bytes, but only for C collation. This was confusing. For
better separation of concerns, create an independent sortsupport
implementation in bytea.c.

The heuristics for bytea_abbrev_abort() remain the same as for
varstr_abbrev_abort(). It's possible that the bytea case warrants
different treatment, but that is left for future investigation.

In passing, adjust some strange looking comparisons in
varstr_abbrev_abort().

Author: Aleksander Alekseev <aleksander@tigerdata.com>
Reviewed-by: John Naylor <johncnaylorls@gmail.com>
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Discussion: https://postgr.es/m/CAJ7c6TP1bAbEhUJa6+rgceN6QJWMSsxhg1=mqfSN=Nb-n6DAKg@mail.gmail.com

src/backend/utils/adt/bytea.c patch | blob | blame | history

src/backend/utils/adt/varlena.c patch | blob | blame | history

src/tools/pgindent/typedefs.list patch | blob | blame | history

diff --git a/src/backend/utils/adt/bytea.c b/src/backend/utils/adt/bytea.c

index 6e7b914c56395fa4d941d3e8c4b43c9534703904..f8524548e4671439e24a5b6f7c63e42c13265f66 100644 (file)

--- a/src/backend/utils/adt/bytea.c

+++ b/src/backend/utils/adt/bytea.c

@@ -15,18 +15,19 @@

#include "postgres.h"

#include "access/detoast.h"

-#include "catalog/pg_collation_d.h"

-#include "catalog/pg_type_d.h"

+#include "common/hashfn.h"

#include "common/int.h"

#include "fmgr.h"

+#include "lib/hyperloglog.h"

#include "libpq/pqformat.h"

#include "port/pg_bitutils.h"

+#include "port/pg_bswap.h"

#include "utils/builtins.h"

#include "utils/bytea.h"

#include "utils/fmgrprotos.h"

+#include "utils/guc.h"

#include "utils/memutils.h"

#include "utils/sortsupport.h"

-#include "utils/varlena.h"

#include "varatt.h"

/* GUC variable */

@@ -37,6 +38,19 @@ static bytea *bytea_substring(Datum str, int S, int L,

bool length_not_specified);

static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);

+typedef struct

+ bool abbreviate; /* Should we abbreviate keys? */

+ hyperLogLogState abbr_card; /* Abbreviated key cardinality state */

+ hyperLogLogState full_card; /* Full key cardinality state */

+ double prop_card; /* Required cardinality proportion */

+} ByteaSortSupport;

+/* Static function declarations for sort support */

+static int byteafastcmp(Datum x, Datum y, SortSupport ssup);

+static Datum bytea_abbrev_convert(Datum original, SortSupport ssup);

+static bool bytea_abbrev_abort(int memtupcount, SortSupport ssup);

* bytea_catenate

* Guts of byteacat(), broken out so it can be used by other functions

@@ -1001,6 +1015,201 @@ bytea_smaller(PG_FUNCTION_ARGS)

PG_RETURN_BYTEA_P(result);

}

+/*

+ * sortsupport comparison func

+ */

+static int

+byteafastcmp(Datum x, Datum y, SortSupport ssup)

+ bytea *arg1 = DatumGetByteaPP(x);

+ bytea *arg2 = DatumGetByteaPP(y);

+ char *a1p,

+ *a2p;

+ int len1,

+ len2,

+ result;

+ a1p = VARDATA_ANY(arg1);

+ a2p = VARDATA_ANY(arg2);

+ len1 = VARSIZE_ANY_EXHDR(arg1);

+ len2 = VARSIZE_ANY_EXHDR(arg2);

+ result = memcmp(a1p, a2p, Min(len1, len2));

+ if ((result == 0) && (len1 != len2))

+ result = (len1 < len2) ? -1 : 1;

+ /* We can't afford to leak memory here. */

+ if (PointerGetDatum(arg1) != x)

+ pfree(arg1);

+ if (PointerGetDatum(arg2) != y)

+ pfree(arg2);

+ return result;

+/*

+ * Conversion routine for sortsupport. Converts original to abbreviated key

+ * representation. Our encoding strategy is simple -- pack the first 8 bytes

+ * of the bytea data into a Datum (on little-endian machines, the bytes are

+ * stored in reverse order), and treat it as an unsigned integer.

+ */

+static Datum

+bytea_abbrev_convert(Datum original, SortSupport ssup)

+ const size_t max_prefix_bytes = sizeof(Datum);

+ ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;

+ bytea *authoritative = DatumGetByteaPP(original);

+ char *authoritative_data = VARDATA_ANY(authoritative);

+ Datum res;

+ char *pres;

+ int len;

+ uint32 hash;

+ pres = (char *) &res;

+ /* memset(), so any non-overwritten bytes are NUL */

+ memset(pres, 0, max_prefix_bytes);

+ len = VARSIZE_ANY_EXHDR(authoritative);

+ /*

+ * Short byteas will have terminating NUL bytes in the abbreviated datum.

+ * Abbreviated comparison need not make a distinction between these NUL

+ * bytes, and NUL bytes representing actual NULs in the authoritative

+ * representation.

+ *

+ * Hopefully a comparison at or past one abbreviated key's terminating NUL

+ * byte will resolve the comparison without consulting the authoritative

+ * representation; specifically, some later non-NUL byte in the longer

+ * bytea can resolve the comparison against a subsequent terminating NUL

+ * in the shorter bytea. There will usually be what is effectively a

+ * "length-wise" resolution there and then.

+ *

+ * If that doesn't work out -- if all bytes in the longer bytea positioned

+ * at or past the offset of the smaller bytea (first) terminating NUL are

+ * actually representative of NUL bytes in the authoritative binary bytea

+ * (perhaps with some *terminating* NUL bytes towards the end of the

+ * longer bytea iff it happens to still be small) -- then an authoritative

+ * tie-breaker will happen, and do the right thing: explicitly consider

+ * bytea length.

+ */

+ memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));

+ /*

+ * Maintain approximate cardinality of both abbreviated keys and original,

+ * authoritative keys using HyperLogLog. Used as cheap insurance against

+ * the worst case, where we do many string abbreviations for no saving in

+ * full memcmp()-based comparisons. These statistics are used by

+ * bytea_abbrev_abort().

+ *

+ * First, Hash key proper, or a significant fraction of it. Mix in length

+ * in order to compensate for cases where differences are past

+ * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.

+ */

+ hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,

+ Min(len, PG_CACHE_LINE_SIZE)));

+ if (len > PG_CACHE_LINE_SIZE)

+ hash ^= DatumGetUInt32(hash_uint32((uint32) len));

+ addHyperLogLog(&bss->full_card, hash);

+ /* Hash abbreviated key */

+ {

+ uint32 tmp;

+ tmp = DatumGetUInt32(res) ^ (uint32) (DatumGetUInt64(res) >> 32);

+ hash = DatumGetUInt32(hash_uint32(tmp));

+ }

+ addHyperLogLog(&bss->abbr_card, hash);

+ /*

+ * Byteswap on little-endian machines.

+ *

+ * This is needed so that ssup_datum_unsigned_cmp() works correctly on all

+ * platforms.

+ */

+ res = DatumBigEndianToNative(res);

+ /* Don't leak memory here */

+ if (PointerGetDatum(authoritative) != original)

+ pfree(authoritative);

+ return res;

+/*

+ * Callback for estimating effectiveness of abbreviated key optimization, using

+ * heuristic rules. Returns value indicating if the abbreviation optimization

+ * should be aborted, based on its projected effectiveness.

+ *

+ * This is based on varstr_abbrev_abort(), but some comments have been elided

+ * for brevity. See there for more details.

+ */

+static bool

+bytea_abbrev_abort(int memtupcount, SortSupport ssup)

+ ByteaSortSupport *bss = (ByteaSortSupport *) ssup->ssup_extra;

+ double abbrev_distinct,

+ key_distinct;

+ Assert(ssup->abbreviate);

+ /* Have a little patience */

+ if (memtupcount < 100)

+ return false;

+ abbrev_distinct = estimateHyperLogLog(&bss->abbr_card);

+ key_distinct = estimateHyperLogLog(&bss->full_card);

+ /*

+ * Clamp cardinality estimates to at least one distinct value. While

+ * NULLs are generally disregarded, if only NULL values were seen so far,

+ * that might misrepresent costs if we failed to clamp.

+ */

+ if (abbrev_distinct < 1.0)

+ abbrev_distinct = 1.0;

+ if (key_distinct < 1.0)

+ key_distinct = 1.0;

+ if (trace_sort)

+ {

+ double norm_abbrev_card = abbrev_distinct / (double) memtupcount;

+ elog(LOG, "bytea_abbrev: abbrev_distinct after %d: %f "

+ "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",

+ memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,

+ bss->prop_card);

+ }

+ /*

+ * If the number of distinct abbreviated keys approximately matches the

+ * number of distinct original keys, continue with abbreviation.

+ */

+ if (abbrev_distinct > key_distinct * bss->prop_card)

+ {

+ /*

+ * Decay required cardinality aggressively after 10,000 tuples.

+ */

+ if (memtupcount > 10000)

+ bss->prop_card *= 0.65;

+ return false;

+ }

+ /*

+ * Abort abbreviation strategy.

+ */

+ if (trace_sort)

+ elog(LOG, "bytea_abbrev: aborted abbreviation at %d "

+ "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",

+ memtupcount, abbrev_distinct, key_distinct, bss->prop_card);

+ return true;

Datum

bytea_sortsupport(PG_FUNCTION_ARGS)

{

@@ -1009,8 +1218,27 @@ bytea_sortsupport(PG_FUNCTION_ARGS)

oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);

- /* Use generic string SortSupport, forcing "C" collation */

- varstr_sortsupport(ssup, BYTEAOID, C_COLLATION_OID);

+ ssup->comparator = byteafastcmp;

+ /*

+ * Set up abbreviation support if requested.

+ */

+ if (ssup->abbreviate)

+ {

+ ByteaSortSupport *bss;

+ bss = palloc_object(ByteaSortSupport);

+ bss->abbreviate = true;

+ bss->prop_card = 0.20;

+ initHyperLogLog(&bss->abbr_card, 10);

+ initHyperLogLog(&bss->full_card, 10);

+ ssup->ssup_extra = bss;

+ ssup->abbrev_full_comparator = ssup->comparator;

+ ssup->comparator = ssup_datum_unsigned_cmp;

+ ssup->abbrev_converter = bytea_abbrev_convert;

+ ssup->abbrev_abort = bytea_abbrev_abort;

+ }

MemoryContextSwitchTo(oldcontext);

diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index baa5b44ea8dd033c1e7a904084ea1a42fc3fe1d7..8adeb8dadc66e98465001ab87e277359df36bd91 100644 (file)

--- a/src/backend/utils/adt/varlena.c

+++ b/src/backend/utils/adt/varlena.c

@@ -92,7 +92,7 @@ typedef struct

int last_returned; /* Last comparison result (cache) */

bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */

bool collate_c;

- Oid typid; /* Actual datatype (text/bpchar/bytea/name) */

+ Oid typid; /* Actual datatype (text/bpchar/name) */

hyperLogLogState abbr_card; /* Abbreviated key cardinality state */

hyperLogLogState full_card; /* Full key cardinality state */

double prop_card; /* Required cardinality proportion */

@@ -1617,10 +1617,8 @@ bttextsortsupport(PG_FUNCTION_ARGS)

* Includes locale support, and support for BpChar semantics (i.e. removing

* trailing spaces before comparison).

- * Relies on the assumption that text, VarChar, BpChar, and bytea all have the

- * same representation. Callers that always use the C collation (e.g.

- * non-collatable type callers like bytea) may have NUL bytes in their strings;

- * this will not work with any other collation, though.

+ * Relies on the assumption that text, VarChar, and BpChar all have the

+ * same representation.

void

varstr_sortsupport(SortSupport ssup, Oid typid, Oid collid)

@@ -1983,7 +1981,7 @@ varstrfastcmp_locale(char *a1p, int len1, char *a2p, int len2, SortSupport ssup)

* representation. Our encoding strategy is simple -- pack the first 8 bytes

* of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are

* stored in reverse order), and treat it as an unsigned integer. When the "C"

- * locale is used, or in case of bytea, just memcpy() from original instead.

+ * locale is used just memcpy() from original instead.

static Datum

varstr_abbrev_convert(Datum original, SortSupport ssup)

@@ -2010,30 +2008,8 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)

* If we're using the C collation, use memcpy(), rather than strxfrm(), to

- * abbreviate keys. The full comparator for the C locale is always

- * memcmp(). It would be incorrect to allow bytea callers (callers that

- * always force the C collation -- bytea isn't a collatable type, but this

- * approach is convenient) to use strxfrm(). This is because bytea

- * strings may contain NUL bytes. Besides, this should be faster, too.

- *

- * More generally, it's okay that bytea callers can have NUL bytes in

- * strings because abbreviated cmp need not make a distinction between

- * terminating NUL bytes, and NUL bytes representing actual NULs in the

- * authoritative representation. Hopefully a comparison at or past one

- * abbreviated key's terminating NUL byte will resolve the comparison

- * without consulting the authoritative representation; specifically, some

- * later non-NUL byte in the longer string can resolve the comparison

- * against a subsequent terminating NUL in the shorter string. There will

- * usually be what is effectively a "length-wise" resolution there and

- * then.

- *

- * If that doesn't work out -- if all bytes in the longer string

- * positioned at or past the offset of the smaller string's (first)

- * terminating NUL are actually representative of NUL bytes in the

- * authoritative binary string (perhaps with some *terminating* NUL bytes

- * towards the end of the longer string iff it happens to still be small)

- * -- then an authoritative tie-breaker will happen, and do the right

- * thing: explicitly consider string length.

+ * abbreviate keys. The full comparator for the C locale is also

+ * memcmp(). This should be faster than strxfrm().

if (sss->collate_c)

memcpy(pres, authoritative_data, Min(len, max_prefix_bytes));

@@ -2115,9 +2091,6 @@ varstr_abbrev_convert(Datum original, SortSupport ssup)

* strxfrm() blob is itself NUL terminated, leaving no danger of

* misinterpreting any NUL bytes not intended to be interpreted as

* logically representing termination.

- *

- * (Actually, even if there were NUL bytes in the blob it would be

- * okay. See remarks on bytea case above.)

memcpy(pres, sss->buf2, Min(max_prefix_bytes, bsize));

}

@@ -2198,10 +2171,10 @@ varstr_abbrev_abort(int memtupcount, SortSupport ssup)

* NULLs are generally disregarded, if only NULL values were seen so far,

* that might misrepresent costs if we failed to clamp.

- if (abbrev_distinct <= 1.0)

+ if (abbrev_distinct < 1.0)

abbrev_distinct = 1.0;

- if (key_distinct <= 1.0)

+ if (key_distinct < 1.0)

key_distinct = 1.0;

diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list

index 3451538565e8efbff36ef96a271768d0365e51b9..04845d5e6809eb61f6276f42af06e909624f5f13 100644 (file)

--- a/src/tools/pgindent/typedefs.list

+++ b/src/tools/pgindent/typedefs.list

@@ -366,6 +366,7 @@ BulkWriteBuffer

BulkWriteState

BumpBlock

BumpContext

+ByteaSortSupport

CACHESIGN

CAC_state

CCFastEqualFN

This is the main PostgreSQL git repository.

RSS Atom