1/*-------------------------------------------------------------------------
4 * POSTGRES multivariate ndistinct coefficients
6 * Estimating number of groups in a combination of columns (e.g. for GROUP BY)
7 * is tricky, and the estimation error is often significant.
9 * The multivariate ndistinct coefficients address this by storing ndistinct
10 * estimates for combinations of the user-specified columns. So for example
11 * given a statistics object on three columns (a,b,c), this module estimates
12 * and stores n-distinct for (a,b), (a,c), (b,c) and (a,b,c). The per-column
13 * estimates are already available in pg_statistic.
16 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
17 * Portions Copyright (c) 1994, Regents of the University of California
20 * src/backend/statistics/mvdistinct.c
22 *-------------------------------------------------------------------------
33#include "utils/fmgrprotos.h"
39 int k,
int *combination);
44/* size of the struct header fields (magic, type, nitems) */
45 #define SizeOfHeader (3 * sizeof(uint32))
47/* size of a serialized ndistinct item (coefficient, natts, atts) */
48 #define SizeOfItem(natts) \
49 (sizeof(double) + sizeof(int) + (natts) * sizeof(AttrNumber))
51/* minimal size of a ndistinct item (with two attributes) */
52 #define MinSizeOfItem SizeOfItem(2)
54/* minimal size of mvndistinct, when all items are minimal */
55 #define MinSizeOfItems(nitems) \
56 (SizeOfHeader + (nitems) * MinSizeOfItem)
58/* Combination generator API */
60/* internal state for generator of k-combinations of n elements */
63 int k;
/* size of the combination */
64 int n;
/* total number of elements */
65 int current;
/* index of the next combination to return */
77 * statext_ndistinct_build
78 * Compute ndistinct coefficient for the combination of attributes.
80 * This computes the ndistinct estimate using the same estimator used
81 * in analyze.c and then computes the coefficient.
83 * To handle expressions easily, we treat them as system attributes with
84 * negative attnums, and offset everything by number of expressions to
85 * allow using Bitmapsets.
93 int numattrs =
data->nattnums;
100 result->
nitems = numcombs;
103 for (k = 2; k <= numattrs; k++)
108 /* generate combinations of K out of N elements */
119 /* translate the indexes to attnums */
120 for (
j = 0;
j < k;
j++)
137 /* must consume exactly the whole output array */
144 * statext_ndistinct_load
145 * Load the ndistinct value for the indicated pg_statistic_ext tuple
158 elog(
ERROR,
"cache lookup failed for statistics object %u", mvoid);
161 Anum_pg_statistic_ext_data_stxdndistinct, &isnull);
164 "requested statistics kind \"%c\" is not yet built for statistics object %u",
165 STATS_EXT_NDISTINCT, mvoid);
175 * statext_ndistinct_serialize
176 * serialize ndistinct to the on-disk bytea format
190 * Base size is size of scalar fields in the struct, plus one base struct
191 * for each item, including number of items for each.
195 /* and also include space for the actual attribute numbers */
211 /* Store the base struct values (magic, type, nitems) */
214 memcpy(tmp, &ndistinct->
type,
sizeof(
uint32));
220 * store number of attributes and attribute numbers for each entry
227 memcpy(tmp, &item.
ndistinct,
sizeof(
double));
228 tmp +=
sizeof(double);
229 memcpy(tmp, &nmembers,
sizeof(
int));
235 /* protect against overflows */
239 /* check we used exactly the expected space */
246 * statext_ndistinct_deserialize
247 * Read an on-disk bytea format MVNDistinct to in-memory format
261 /* we expect at least the basic fields of MVNDistinct struct */
263 elog(
ERROR,
"invalid MVNDistinct size %zu (expected at least %zu)",
266 /* initialize pointer to the data part (skip the varlena header) */
269 /* read the header fields and perform basic sanity checks */
278 elog(
ERROR,
"invalid ndistinct magic %08x (expected %08x)",
281 elog(
ERROR,
"invalid ndistinct type %d (expected %d)",
284 elog(
ERROR,
"invalid zero-length item array in MVNDistinct");
286 /* what minimum bytea size do we expect for those parameters */
289 elog(
ERROR,
"invalid MVNDistinct size %zu (expected at least %zu)",
293 * Allocate space for the ndistinct items (no space for each item's
294 * attnos: those live in bitmapsets allocated separately)
306 /* ndistinct value */
307 memcpy(&item->
ndistinct, tmp,
sizeof(
double));
308 tmp +=
sizeof(double);
310 /* number of attributes */
321 /* still within the bytea */
325 /* we should have consumed the whole bytea exactly */
333 * input routine for type pg_ndistinct
335 * pg_ndistinct is real enough to be a table column, but it has no
336 * operations of its own, and disallows input (just like pg_node_tree).
342 (
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
343 errmsg(
"cannot accept a value of type %s",
"pg_ndistinct")));
350 * output routine for type pg_ndistinct
352 * Produces a human-readable representation of the value.
389 * binary input routine for type pg_ndistinct
395 (
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
396 errmsg(
"cannot accept a value of type %s",
"pg_ndistinct")));
403 * binary output routine for type pg_ndistinct
405 * n-distinct is serialized into a bytea value, so let's send that.
414 * ndistinct_for_combination
415 * Estimates number of distinct values in a combination of columns.
417 * This uses the same ndistinct estimator as compute_scalar_stats() in
419 * n*d / (n - f1 + f1*n/N)
421 * except that instead of values in a single column we are dealing with
422 * combination of multiple columns.
426 int k,
int *combination)
437 int numrows =
data->numrows;
442 * In order to determine the number of distinct elements, create separate
443 * values[]/isnull[] arrays with all the data we have, then sort them
444 * using the specified column combination as dimensions. We could try to
445 * sort in place, but it'd probably be more complex and bug-prone.
449 isnull = (
bool *)
palloc0(
sizeof(
bool) * numrows * k);
451 for (
i = 0;
i < numrows;
i++)
454 items[
i].isnull = &isnull[
i * k];
458 * For each dimension, set up sort-support and fill in the values from the
461 * We use the column data types' default sort operators and collations;
462 * perhaps at some point it'd be worth using column-specific collations?
464 for (
i = 0;
i < k;
i++)
476 elog(
ERROR,
"cache lookup failed for ordering operator for type %u",
479 /* prepare the sort function for this dimension */
482 /* accumulate all the data for this dimension into the arrays */
483 for (
j = 0;
j < numrows;
j++)
490 /* We can sort the array now ... */
494 /* ... and count the number of distinct combinations */
499 for (
i = 1;
i < numrows;
i++)
519/* The Duj1 estimator (already used in analyze.c). */
527 numer = (double) numrows * (
double) d;
529 denom = (double) (numrows -
f1) +
530 (double)
f1 * (
double) numrows / totalrows;
532 ndistinct = numer / denom;
534 /* Clamp to sane range in case of roundoff error */
535 if (ndistinct < (
double) d)
536 ndistinct = (
double) d;
538 if (ndistinct > totalrows)
539 ndistinct = totalrows;
541 return floor(ndistinct + 0.5);
546 * computes binomial coefficients using an algorithm that is both
547 * efficient and prevents overflows
555 Assert((k > 0) && (n >= k));
557 /* use symmetry of the binomial coefficients */
561 for (d = 1; d <= k; ++d)
572 * number of combinations, excluding single-value combinations
577 return (1 << n) - (n + 1);
582 * initialize the generator of combinations
584 * The generator produces combinations of K elements in the interval (0..N).
585 * We prebuild all the combinations in this method, which is simpler than
586 * generating them on the fly.
593 Assert((n >= k) && (k > 0));
595 /* allocate the generator state as a single chunk of memory */
600 /* pre-allocate space for all combinations */
601 state->combinations = (
int *)
palloc(
sizeof(
int) * k *
state->ncombinations);
607 /* now actually pre-generate all the combinations of K elements */
610 /* make sure we got the expected number of combinations */
613 /* reset the number, so we start with the first one */
621 * returns the next combination from the prebuilt list
623 * Returns a combination of K array indexes (0 .. N), as specified to
624 * generator_init), or NULL when there are no more combination.
637 * free the internal state of the generator
639 * Releases the generator internal state (pre-built combinations).
649 * generate_combinations_recurse
650 * given a prefix, generate all possible combinations
652 * Given a prefix (first few elements of the combination), generate following
653 * elements recursively. We generate the combinations in lexicographic order,
654 * which eliminates permutations of the same combination.
660 /* If we haven't filled all the elements, simply recurse. */
666 * The values have to be in ascending order, so make sure we start
667 * with the value passed by parameter.
680 /* we got a valid combination, add it to the array */
682 current,
state->k *
sizeof(
int));
688 * generate_combinations
689 * generate all k-combinations of N elements
#define AttributeNumberIsValid(attributeNumber)
static Datum values[MAXATTR]
Datum byteasend(PG_FUNCTION_ARGS)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
int multi_sort_compare(const void *a, const void *b, void *arg)
MultiSortSupport multi_sort_init(int ndims)
void multi_sort_add_dimension(MultiSortSupport mss, int sortdim, Oid oper, Oid collation)
#define PG_GETARG_BYTEA_PP(n)
#define DatumGetByteaPP(X)
#define PG_RETURN_CSTRING(x)
Assert(PointerIsAligned(start, uint64))
#define HeapTupleIsValid(tuple)
void pfree(void *pointer)
void * palloc0(Size size)
Datum pg_ndistinct_out(PG_FUNCTION_ARGS)
static int n_choose_k(int n, int k)
Datum pg_ndistinct_in(PG_FUNCTION_ARGS)
struct CombinationGenerator CombinationGenerator
static double estimate_ndistinct(double totalrows, int numrows, int d, int f1)
static void generate_combinations_recurse(CombinationGenerator *state, int index, int start, int *current)
Datum pg_ndistinct_recv(PG_FUNCTION_ARGS)
MVNDistinct * statext_ndistinct_deserialize(bytea *data)
static double ndistinct_for_combination(double totalrows, StatsBuildData *data, int k, int *combination)
bytea * statext_ndistinct_serialize(MVNDistinct *ndistinct)
static void generate_combinations(CombinationGenerator *state)
MVNDistinct * statext_ndistinct_load(Oid mvoid, bool inh)
static int num_combinations(int n)
MVNDistinct * statext_ndistinct_build(double totalrows, StatsBuildData *data)
#define SizeOfItem(natts)
static void generator_free(CombinationGenerator *state)
static CombinationGenerator * generator_init(int n, int k)
Datum pg_ndistinct_send(PG_FUNCTION_ARGS)
#define MinSizeOfItems(nitems)
static int * generator_next(CombinationGenerator *state)
void qsort_interruptible(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
static Datum BoolGetDatum(bool X)
static Datum ObjectIdGetDatum(Oid X)
#define STATS_NDISTINCT_MAGIC
#define STATS_NDISTINCT_TYPE_BASIC
#define STATS_MAX_DIMENSIONS
void appendStringInfo(StringInfo str, const char *fmt,...)
void appendStringInfoString(StringInfo str, const char *s)
void appendStringInfoChar(StringInfo str, char ch)
void initStringInfo(StringInfo str)
MVNDistinctItem items[FLEXIBLE_ARRAY_MEMBER]
void ReleaseSysCache(HeapTuple tuple)
Datum SysCacheGetAttr(int cacheId, HeapTuple tup, AttrNumber attributeNumber, bool *isNull)
HeapTuple SearchSysCache2(int cacheId, Datum key1, Datum key2)
TypeCacheEntry * lookup_type_cache(Oid type_id, int flags)
static Size VARSIZE_ANY(const void *PTR)
static Size VARSIZE_ANY_EXHDR(const void *PTR)
static char * VARDATA(const void *PTR)
static char * VARDATA_ANY(const void *PTR)
static void SET_VARSIZE(void *PTR, Size len)