218{
220 int num_mcelem;
221 int null_elem_cnt = 0;
222 int analyzed_rows = 0;
223
224 /* This is D from the LC algorithm. */
228
229 /* This is the current bucket number from the LC algorithm */
230 int b_current;
231
232 /* This is 'w' from the LC algorithm */
233 int bucket_width;
234 int array_no;
237 int slot_idx;
241
243
244 /*
245 * Invoke analyze.c's standard analysis function to create scalar-style
246 * stats for the column. It will expect its own extra_data pointer, so
247 * temporarily install that.
248 */
252
253 /*
254 * Set up static pointer for use by subroutines. We wait till here in
255 * case std_compute_stats somehow recursively invokes us (probably not
256 * possible, but ...)
257 */
259
260 /*
261 * We want statistics_target * 10 elements in the MCELEM array. This
262 * multiplier is pretty arbitrary, but is meant to reflect the fact that
263 * the number of individual elements tracked in pg_statistic ought to be
264 * more than the number of values for a simple scalar column.
265 */
267
268 /*
269 * We set bucket width equal to num_mcelem / 0.007 as per the comment
270 * above.
271 */
272 bucket_width = num_mcelem * 1000 / 7;
273
274 /*
275 * Create the hashtable. It will be in local memory, so we don't need to
276 * worry about overflowing the initial size. Also we don't need to pay any
277 * attention to locking and memory management.
278 */
284 elements_tab =
hash_create(
"Analyzed elements table",
285 num_mcelem,
286 &elem_hash_ctl,
288
289 /* hashtable for array distinct elements counts */
290 count_hash_ctl.
keysize =
sizeof(int);
293 count_tab =
hash_create(
"Array distinct element count table",
294 64,
295 &count_hash_ctl,
297
298 /* Initialize counters. */
299 b_current = 1;
300 element_no = 0;
301
302 /* Loop over the arrays. */
303 for (array_no = 0; array_no < samplerows; array_no++)
304 {
306 bool isnull;
308 int num_elems;
310 bool *elem_nulls;
311 bool null_present;
313 int64 prev_element_no = element_no;
314 int distinct_count;
315 bool count_item_found;
316
318
319 value = fetchfunc(stats, array_no, &isnull);
320 if (isnull)
321 {
322 /* ignore arrays that are null overall */
323 continue;
324 }
325
326 /* Skip too-large values. */
328 continue;
329 else
330 analyzed_rows++;
331
332 /*
333 * Now detoast the array if needed, and deconstruct into datums.
334 */
336
343 &elem_values, &elem_nulls, &num_elems);
344
345 /*
346 * We loop through the elements in the array and add them to our
347 * tracking hashtable.
348 */
349 null_present = false;
350 for (
j = 0;
j < num_elems;
j++)
351 {
353 bool found;
354
355 /* No null element processing other than flag setting here */
357 {
358 null_present = true;
359 continue;
360 }
361
362 /* Lookup current element in hashtable, adding it if new */
363 elem_value = elem_values[
j];
365 &elem_value,
367
368 if (found)
369 {
370 /* The element value is already on the tracking list */
371
372 /*
373 * The operators we assist ignore duplicate array elements, so
374 * count a given distinct element only once per array.
375 */
377 continue;
378
381 }
382 else
383 {
384 /* Initialize new tracking list element */
385
386 /*
387 * If element type is pass-by-reference, we must copy it into
388 * palloc'd space, so that we can release the array below. (We
389 * do this so that the space needed for element values is
390 * limited by the size of the hashtable; if we kept all the
391 * array values around, it could be much more.)
392 */
396
398 item->
delta = b_current - 1;
400 }
401
402 /* element_no is the number of elements processed (ie N) */
403 element_no++;
404
405 /* We prune the D structure after processing each bucket */
406 if (element_no % bucket_width == 0)
407 {
409 b_current++;
410 }
411 }
412
413 /* Count null element presence once per array. */
414 if (null_present)
415 null_elem_cnt++;
416
417 /* Update frequency of the particular array distinct element count. */
418 distinct_count = (int) (element_no - prev_element_no);
421 &count_item_found);
422
423 if (count_item_found)
425 else
427
428 /* Free memory allocated while detoasting. */
433 }
434
435 /* Skip pg_statistic slots occupied by standard statistics */
436 slot_idx = 0;
437 while (slot_idx < STATISTIC_NUM_SLOTS && stats->stakind[slot_idx] != 0)
438 slot_idx++;
440 elog(
ERROR,
"insufficient pg_statistic slots for array stats");
441
442 /* We can only compute real stats if we found some non-null values. */
443 if (analyzed_rows > 0)
444 {
445 int nonnull_cnt = analyzed_rows;
446 int count_items_count;
449 int track_len;
452 maxfreq;
453
454 /*
455 * We assume the standard stats code already took care of setting
456 * stats_valid, stanullfrac, stawidth, stadistinct. We'd have to
457 * re-compute those values if we wanted to not store the standard
458 * stats.
459 */
460
461 /*
462 * Construct an array of the interesting hashtable items, that is,
463 * those meeting the cutoff frequency (s - epsilon)*N. Also identify
464 * the maximum frequency among these items.
465 *
466 * Since epsilon = s/10 and bucket_width = 1/epsilon, the cutoff
467 * frequency is 9*N / bucket_width.
468 */
469 cutoff_freq = 9 * element_no / bucket_width;
470
473
475 track_len = 0;
476 maxfreq = 0;
478 {
480 {
481 sort_table[track_len++] = item;
483 }
484 }
486
487 /* emit some statistics for debug purposes */
488 elog(
DEBUG3,
"compute_array_stats: target # mces = %d, "
489 "bucket width = %d, "
491 "usable entries = %d",
492 num_mcelem, bucket_width, element_no,
i, track_len);
493
494 /*
495 * If we obtained more elements than we really want, get rid of those
496 * with least frequencies. The easiest way is to qsort the array into
497 * descending frequency order and truncate the array.
498 *
499 * If we did not find more elements than we want, then it is safe to
500 * assume that the stored MCE array will contain every element with
501 * frequency above the cutoff. In that case, rather than storing the
502 * smallest frequency we are keeping, we want to store the minimum
503 * frequency that would have been accepted as a valid MCE. The
504 * selectivity functions can assume that that is an upper bound on the
505 * frequency of elements not present in the array.
506 *
507 * If we found no candidate MCEs at all, we still want to record the
508 * cutoff frequency, since it's still valid to assume that no element
509 * has frequency more than that.
510 */
511 if (num_mcelem < track_len)
512 {
515 /* set minfreq to the smallest frequency we're keeping */
516 minfreq = sort_table[num_mcelem - 1]->
frequency;
517 }
518 else
519 {
520 num_mcelem = track_len;
521 /* set minfreq to the minimum frequency above the cutoff */
522 minfreq = cutoff_freq + 1;
523 /* ensure maxfreq is nonzero, too */
524 if (track_len == 0)
525 maxfreq = minfreq;
526 }
527
528 /* Generate MCELEM slot entry */
529 if (num_mcelem >= 0)
530 {
532 Datum *mcelem_values;
534
535 /*
536 * We want to store statistics sorted on the element value using
537 * the element type's default comparison function. This permits
538 * fast binary searches in selectivity estimation functions.
539 */
542
543 /* Must copy the target values into anl_context */
545
546 /*
547 * We sorted statistics on the element value, but we want to be
548 * able to find the minimal and maximal frequencies without going
549 * through all the values. We also want the frequency of null
550 * elements. Store these three values at the end of mcelem_freqs.
551 */
554
555 /*
556 * See comments above about use of nonnull_cnt as the divisor for
557 * the final frequency estimates.
558 */
559 for (
i = 0;
i < num_mcelem;
i++)
560 {
562
567 (double) nonnull_cnt;
568 }
569 mcelem_freqs[
i++] = (double) minfreq / (
double) nonnull_cnt;
570 mcelem_freqs[
i++] = (double) maxfreq / (
double) nonnull_cnt;
571 mcelem_freqs[
i++] = (double) null_elem_cnt / (
double) nonnull_cnt;
572
574
575 stats->
stakind[slot_idx] = STATISTIC_KIND_MCELEM;
579 /* See above comment about extra stanumber entries */
581 stats->
stavalues[slot_idx] = mcelem_values;
583 /* We are storing values of element type */
588 slot_idx++;
589 }
590
591 /* Generate DECHIST slot entry */
593 if (count_items_count > 0)
594 {
598 int delta;
601
602 /* num_hist must be at least 2 for the loop below to work */
603 num_hist =
Max(num_hist, 2);
604
605 /*
606 * Create an array of DECountItem pointers, and sort them into
607 * increasing count order.
608 */
614 {
615 sorted_count_items[
j++] = count_item;
616 }
620
621 /*
622 * Prepare to fill stanumbers with the histogram, followed by the
623 * average count. This array must be stored in anl_context.
624 */
627 sizeof(
float4) * (num_hist + 1));
628 hist[num_hist] = (double) element_no / (double) nonnull_cnt;
629
630 /*----------
631 * Construct the histogram of distinct-element counts (DECs).
632 *
633 * The object of this loop is to copy the min and max DECs to
634 * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
635 * in between (where "evenly-spaced" is with reference to the
636 * whole input population of arrays). If we had a complete sorted
637 * array of DECs, one per analyzed row, the i'th hist value would
638 * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
639 * (compare the histogram-making loop in compute_scalar_stats()).
640 * But instead of that we have the sorted_count_items[] array,
641 * which holds unique DEC values with their frequencies (that is,
642 * a run-length-compressed version of the full array). So we
643 * control advancing through sorted_count_items[] with the
644 * variable "frac", which is defined as (x - y) * (num_hist - 1),
645 * where x is the index in the notional DECs array corresponding
646 * to the start of the next sorted_count_items[] element's run,
647 * and y is the index in DECs from which we should take the next
648 * histogram value. We have to advance whenever x <= y, that is
649 * frac <= 0. The x component is the sum of the frequencies seen
650 * so far (up through the current sorted_count_items[] element),
651 * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
652 * per the subscript calculation above. (The subscript calculation
653 * implies dropping any fractional part of y; in this formulation
654 * that's handled by not advancing until frac reaches 1.)
655 *
656 * Even though frac has a bounded range, it could overflow int32
657 * when working with very large statistics targets, so we do that
658 * math in int64.
659 *----------
660 */
661 delta = analyzed_rows - 1;
662 j = 0;
/* current index in sorted_count_items */
663 /* Initialize frac for sorted_count_items[0]; y is initially 0 */
664 frac = (
int64) sorted_count_items[0]->frequency * (num_hist - 1);
665 for (
i = 0;
i < num_hist;
i++)
666 {
667 while (frac <= 0)
668 {
669 /* Advance, and update x component of frac */
671 frac += (
int64) sorted_count_items[
j]->frequency * (num_hist - 1);
672 }
673 hist[
i] = sorted_count_items[
j]->
count;
674 frac -= delta; /* update y for upcoming i increment */
675 }
676 Assert(
j == count_items_count - 1);
677
678 stats->
stakind[slot_idx] = STATISTIC_KIND_DECHIST;
683 slot_idx++;
684 }
685 }
686
687 /*
688 * We don't need to bother cleaning up any of our temporary palloc's. The
689 * hashtable should also go away, as it used a child memory context.
690 */
691}
#define DatumGetArrayTypeP(X)
static int trackitem_compare_frequencies_desc(const void *e1, const void *e2, void *arg)
static int trackitem_compare_element(const void *e1, const void *e2, void *arg)
static int element_match(const void *key1, const void *key2, Size keysize)
static uint32 element_hash(const void *key, Size keysize)
#define ARRAY_WIDTH_THRESHOLD
static int countitem_compare_count(const void *e1, const void *e2, void *arg)
static ArrayAnalyzeExtraData * array_extra_data
static void prune_element_hashtable(HTAB *elements_tab, int b_current)
void deconstruct_array(ArrayType *array, Oid elmtype, int elmlen, bool elmbyval, char elmalign, Datum **elemsp, bool **nullsp, int *nelemsp)
Datum datumCopy(Datum value, bool typByVal, int typLen)
Size toast_raw_datum_size(Datum value)
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
void * hash_seq_search(HASH_SEQ_STATUS *status)
int64 hash_get_num_entries(HTAB *hashp)
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Assert(PointerIsAligned(start, uint64))
void * MemoryContextAlloc(MemoryContext context, Size size)
void pfree(void *pointer)
MemoryContext CurrentMemoryContext
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
#define STATISTIC_NUM_SLOTS
void qsort_interruptible(void *base, size_t nel, size_t elsize, qsort_arg_comparator cmp, void *arg)
static Datum PointerGetDatum(const void *X)
int16 stakind[STATISTIC_NUM_SLOTS]
MemoryContext anl_context
Oid statypid[STATISTIC_NUM_SLOTS]
Oid staop[STATISTIC_NUM_SLOTS]
Oid stacoll[STATISTIC_NUM_SLOTS]
char statypalign[STATISTIC_NUM_SLOTS]
float4 * stanumbers[STATISTIC_NUM_SLOTS]
bool statypbyval[STATISTIC_NUM_SLOTS]
int16 statyplen[STATISTIC_NUM_SLOTS]
int numvalues[STATISTIC_NUM_SLOTS]
Datum * stavalues[STATISTIC_NUM_SLOTS]
int numnumbers[STATISTIC_NUM_SLOTS]
void vacuum_delay_point(bool is_analyze)