index 9bf64486242549413b6aafa813a19d178eeef151..ddffd69cb6e603e4df59172f8768643e1f1892a4 100644 (file)
*/
if (sslot.nnumbers == sslot.nvalues + 3)
{
- /* Grab the lowest frequency. */
- minfreq = sslot.numbers[sslot.nnumbers - (sslot.nnumbers - sslot.nvalues)];
+ /* Grab the minimal MCE frequency. */
+ minfreq = sslot.numbers[sslot.nvalues];
mcelems = sslot.values;
mcefreqs = sslot.numbers;
@@ -269,8 +269,11 @@ int_query_opr_selec(ITEM *item, Datum *mcelems, float4 *mcefreqs,
else
{
/*
- * The element is not in MCELEM. Punt, but assume that the
- * selectivity cannot be more than minfreq / 2.
+ * The element is not in MCELEM. Estimate its frequency as half
+ * that of the least-frequent MCE. (We know it cannot be more
+ * than minfreq, and it could be a great deal less. Half seems
+ * like a good compromise.) For probably-historical reasons,
+ * clamp to not more than DEFAULT_EQ_SEL.
*/
selec = Min(DEFAULT_EQ_SEL, minfreq / 2);
}
index 8ea2913d9063215db995abf3a29103a9108d25c2..12b4f3fd36ead9bf353a6c780cd08e37f945ad9a 100644 (file)
@@ -1711,10 +1711,9 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
i = Anum_pg_statistic_stanumbers1 - 1;
for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
{
- int nnum = stats->numnumbers[k];
-
- if (nnum > 0)
+ if (stats->stanumbers[k] != NULL)
{
+ int nnum = stats->numnumbers[k];
Datum *numdatums = (Datum *) palloc(nnum * sizeof(Datum));
ArrayType *arry;
@@ -1732,7 +1731,7 @@ update_attstats(Oid relid, bool inh, int natts, VacAttrStats **vacattrstats)
i = Anum_pg_statistic_stavalues1 - 1;
for (k = 0; k < STATISTIC_NUM_SLOTS; k++)
{
- if (stats->numvalues[k] > 0)
+ if (stats->stavalues[k] != NULL)
{
ArrayType *arry;
index 453a5e5c2ea064d02de0b3fce709412222a3e885..6a71ae6452dcd0d79ccc8cd25bf1378482891a45 100644 (file)
@@ -239,8 +239,8 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
}
/*
- * Grab the lowest frequency. compute_tsvector_stats() stored it for us in
- * the one before the last cell of the Numbers array. See ts_typanalyze.c
+ * Grab the lowest MCE frequency. compute_tsvector_stats() stored it for
+ * us in the next-to-last cell of the Numbers array.
*/
minfreq = numbers[nnumbers - 2];
@@ -374,8 +374,11 @@ tsquery_opr_selec(QueryItem *item, char *operand,
else
{
/*
- * The element is not in MCELEM. Punt, but assume that the
- * selectivity cannot be more than minfreq / 2.
+ * The element is not in MCELEM. Estimate its frequency as
+ * half that of the least-frequent MCE. (We know it cannot be
+ * more than minfreq, and it could be a great deal less. Half
+ * seems like a good compromise.) For probably-historical
+ * reasons, clamp to not more than DEFAULT_TS_MATCH_SEL.
*/
selec = Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
}
index c5a71331ce8a00a50c6dd0d8d9f5e540740b9c98..93aab00a3cacc5eca7c2ca58245914c28747f4b0 100644 (file)
/*
* compute_tsvector_stats() -- compute statistics for a tsvector column
*
- * This functions computes statistics that are useful for determining @@
+ * This function computes statistics that are useful for determining @@
* operations' selectivity, along with the fraction of non-null rows and
* average width.
*
@@ -312,7 +312,7 @@ compute_tsvector_stats(VacAttrStats *stats,
/*
* Construct an array of the interesting hashtable items, that is,
* those meeting the cutoff frequency (s - epsilon)*N. Also identify
- * the minimum and maximum frequencies among these items.
+ * the maximum frequency among these items.
*
* Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
* frequency is 9*N / bucket_width.
hash_seq_init(&scan_status, lexemes_tab);
track_len = 0;
- minfreq = lexeme_no;
maxfreq = 0;
while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
{
if (item->frequency > cutoff_freq)
{
sort_table[track_len++] = item;
- minfreq = Min(minfreq, item->frequency);
maxfreq = Max(maxfreq, item->frequency);
}
}
* If we obtained more lexemes than we really want, get rid of those
* with least frequencies. The easiest way is to qsort the array into
* descending frequency order and truncate the array.
+ *
+ * If we did not find more elements than we want, then it is safe to
+ * assume that the stored MCE array will contain every element with
+ * frequency above the cutoff. In that case, rather than storing the
+ * smallest frequency we are keeping, we want to store the minimum
+ * frequency that would have been accepted as a valid MCE. The
+ * selectivity functions can assume that that is an upper bound on the
+ * frequency of elements not present in the array.
+ *
+ * If we found no candidate MCEs at all, we still want to record that
+ * minimum acceptable frequency, since it's still valid to assume that
+ * no element has frequency more than that.
*/
if (num_mcelem < track_len)
{
qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
trackitem_compare_frequencies_desc, NULL);
- /* reset minfreq to the smallest frequency we're keeping */
+ /* set minfreq to the smallest frequency we're keeping */
minfreq = sort_table[num_mcelem - 1]->frequency;
}
else
+ {
num_mcelem = track_len;
+ /* set minfreq to the minimum frequency above the cutoff */
+ minfreq = cutoff_freq + 1;
+ /* ensure maxfreq is nonzero, too */
+ if (track_len == 0)
+ maxfreq = minfreq;
+ }
/* Generate MCELEM slot entry */
- if (num_mcelem > 0)
+ if (num_mcelem >= 0)
{
MemoryContext old_context;
Datum *mcelem_values;
index a69a84c2aee33e069a6a9d0965be5778fc19b105..cf6fbf8652cf509f6902a14e7f221b69b42710b1 100644 (file)
@@ -544,12 +544,15 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
if (numbers)
{
- /* Grab the lowest observed frequency */
+ /* Grab the minimal MCE frequency */
minfreq = numbers[nmcelem];
}
else
{
- /* Without statistics make some default assumptions */
+ /*
+ * Without statistics, use DEFAULT_CONTAIN_SEL (the factor of 2 will
+ * be removed again below).
+ */
minfreq = 2 * (float4) DEFAULT_CONTAIN_SEL;
}
@@ -621,8 +624,11 @@ mcelem_array_contain_overlap_selec(Datum *mcelem, int nmcelem,
else
{
/*
- * The element is not in MCELEM. Punt, but assume that the
- * selectivity cannot be more than minfreq / 2.
+ * The element is not in MCELEM. Estimate its frequency as half
+ * that of the least-frequent MCE. (We know it cannot be more
+ * than minfreq, and it could be a great deal less. Half seems
+ * like a good compromise.) For probably-historical reasons,
+ * clamp to not more than DEFAULT_CONTAIN_SEL.
*/
elem_selec = Min(DEFAULT_CONTAIN_SEL, minfreq / 2);
}
@@ -728,7 +734,7 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
/*
* Grab some of the summary statistics that compute_array_stats() stores:
- * lowest frequency, frequency of null elements, and average distinct
+ * lowest MCE frequency, frequency of null elements, and average distinct
* element count.
*/
minfreq = numbers[nmcelem];
@@ -802,8 +808,11 @@ mcelem_array_contained_selec(Datum *mcelem, int nmcelem,
else
{
/*
- * The element is not in MCELEM. Punt, but assume that the
- * selectivity cannot be more than minfreq / 2.
+ * The element is not in MCELEM. Estimate its frequency as half
+ * that of the least-frequent MCE. (We know it cannot be more
+ * than minfreq, and it could be a great deal less. Half seems
+ * like a good compromise.) For probably-historical reasons,
+ * clamp to not more than DEFAULT_CONTAIN_SEL.
*/
elem_selec[unique_nitems] = Min(DEFAULT_CONTAIN_SEL,
minfreq / 2);
index 6f61629b9778de2f3f89f99816b88761e99b1c1e..560b27f3ca7d62210ed3c2fb2d5daf7c12ec7edf 100644 (file)
@@ -461,7 +461,7 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
/*
* Construct an array of the interesting hashtable items, that is,
* those meeting the cutoff frequency (s - epsilon)*N. Also identify
- * the minimum and maximum frequencies among these items.
+ * the maximum frequency among these items.
*
* Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
* frequency is 9*N / bucket_width.
@@ -473,14 +473,12 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
hash_seq_init(&scan_status, elements_tab);
track_len = 0;
- minfreq = element_no;
maxfreq = 0;
while ((item = (TrackItem *) hash_seq_search(&scan_status)) != NULL)
{
if (item->frequency > cutoff_freq)
{
sort_table[track_len++] = item;
- minfreq = Min(minfreq, item->frequency);
maxfreq = Max(maxfreq, item->frequency);
}
}
@@ -497,19 +495,38 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
* If we obtained more elements than we really want, get rid of those
* with least frequencies. The easiest way is to qsort the array into
* descending frequency order and truncate the array.
+ *
+ * If we did not find more elements than we want, then it is safe to
+ * assume that the stored MCE array will contain every element with
+ * frequency above the cutoff. In that case, rather than storing the
+ * smallest frequency we are keeping, we want to store the minimum
+ * frequency that would have been accepted as a valid MCE. The
+ * selectivity functions can assume that that is an upper bound on the
+ * frequency of elements not present in the array.
+ *
+ * If we found no candidate MCEs at all, we still want to record that
+ * minimum acceptable frequency, since it's still valid to assume that
+ * no element has frequency more than that.
*/
if (num_mcelem < track_len)
{
qsort_interruptible(sort_table, track_len, sizeof(TrackItem *),
trackitem_compare_frequencies_desc, NULL);
- /* reset minfreq to the smallest frequency we're keeping */
+ /* set minfreq to the smallest frequency we're keeping */
minfreq = sort_table[num_mcelem - 1]->frequency;
}
else
+ {
num_mcelem = track_len;
+ /* set minfreq to the minimum frequency above the cutoff */
+ minfreq = cutoff_freq + 1;
+ /* ensure maxfreq is nonzero, too */
+ if (track_len == 0)
+ maxfreq = minfreq;
+ }
/* Generate MCELEM slot entry */
- if (num_mcelem > 0)
+ if (num_mcelem >= 0)
{
MemoryContext old_context;
Datum *mcelem_values;
index 4216e27a8a4f753047ed16b9ebb103e9de0ec467..444dc27dcadee193979d61322347d3b2e84a796e 100644 (file)
@@ -240,6 +240,10 @@ DECLARE_FOREIGN_KEY((starelid, staattnum), pg_attribute, (attrelid, attnum));
* the fraction of non-null rows that contain at least one null element). If
* this member is omitted, the column is presumed to contain no null elements.
*
+ * Starting in v19, the first extra member can be smaller than the smallest
+ * frequency of any stored MCE, indicating that it's known that no element
+ * not present in the MCE array has frequency greater than that value.
+ *
* Note: in current usage for tsvector columns, the stavalues elements are of
* type text, even though their representation within tsvector is not
* exactly text.