void ExecChooseHashTableSize (double ntuples, int tupwidth, bool useskew, bool try_combined_hash_mem, int parallel_workers, size_t *space_allowed, int *numbuckets, int *numbatches, int *num_skew_mcvs)

int ExecHashGetSkewBucket (HashJoinTable hashtable, uint32 hashvalue)

void ExecHashEstimate (HashState *node, ParallelContext *pcxt)

void ExecHashInitializeDSM (HashState *node, ParallelContext *pcxt)

void ExecHashInitializeWorker (HashState *node, ParallelWorkerContext *pwcxt)

void ExecHashRetrieveInstrumentation (HashState *node)

void ExecShutdownHash (HashState *node)

void ExecHashAccumInstrumentation (HashInstrumentation *instrument, HashJoinTable hashtable)

Function Documentation

◆ ExecChooseHashTableSize()

void ExecChooseHashTableSize ( double ntuples,

int tupwidth,

bool useskew,

bool try_combined_hash_mem,

int parallel_workers,

size_t * space_allowed,

int * numbuckets,

int * numbatches,

int * num_skew_mcvs

)

Definition at line 657 of file nodeHash.c.

664{

665 int tupsize;

666 double inner_rel_bytes;

667 size_t hash_table_bytes;

668 size_t bucket_bytes;

669 size_t max_pointers;

670 int nbatch = 1;

671 int nbuckets;

672 double dbuckets;

673

674 /* Force a plausible relation size if no info */

675 if (ntuples <= 0.0)

676 ntuples = 1000.0;

677

678 /*

679 * Estimate tupsize based on footprint of tuple in hashtable... note this

680 * does not allow for any palloc overhead. The manipulations of spaceUsed

681 * don't count palloc overhead either.

682 */

683 tupsize = HJTUPLE_OVERHEAD +

684 MAXALIGN(SizeofMinimalTupleHeader) +

685 MAXALIGN(tupwidth);

686 inner_rel_bytes = ntuples * tupsize;

687

688 /*

689 * Compute in-memory hashtable size limit from GUCs.

690 */

691 hash_table_bytes = get_hash_memory_limit();

692

693 /*

694 * Parallel Hash tries to use the combined hash_mem of all workers to

695 * avoid the need to batch. If that won't work, it falls back to hash_mem

696 * per worker and tries to process batches in parallel.

697 */

698 if (try_combined_hash_mem)

699 {

700 /* Careful, this could overflow size_t */

701 double newlimit;

702

703 newlimit = (double) hash_table_bytes * (double) (parallel_workers + 1);

704 newlimit = Min(newlimit, (double) SIZE_MAX);

705 hash_table_bytes = (size_t) newlimit;

706 }

707

708 *space_allowed = hash_table_bytes;

709

710 /*

711 * If skew optimization is possible, estimate the number of skew buckets

712 * that will fit in the memory allowed, and decrement the assumed space

713 * available for the main hash table accordingly.

714 *

715 * We make the optimistic assumption that each skew bucket will contain

716 * one inner-relation tuple. If that turns out to be low, we will recover

717 * at runtime by reducing the number of skew buckets.

718 *

719 * hashtable->skewBucket will have up to 8 times as many HashSkewBucket

720 * pointers as the number of MCVs we allow, since ExecHashBuildSkewHash

721 * will round up to the next power of 2 and then multiply by 4 to reduce

722 * collisions.

723 */

724 if (useskew)

725 {

726 size_t bytes_per_mcv;

727 size_t skew_mcvs;

728

729 /*----------

730 * Compute number of MCVs we could hold in hash_table_bytes

731 *

732 * Divisor is:

733 * size of a hash tuple +

734 * worst-case size of skewBucket[] per MCV +

735 * size of skewBucketNums[] entry +

736 * size of skew bucket struct itself

737 *----------

738 */

739 bytes_per_mcv = tupsize +

740 (8 * sizeof(HashSkewBucket *)) +

741 sizeof(int) +

742 SKEW_BUCKET_OVERHEAD;

743 skew_mcvs = hash_table_bytes / bytes_per_mcv;

744

745 /*

746 * Now scale by SKEW_HASH_MEM_PERCENT (we do it in this order so as

747 * not to worry about size_t overflow in the multiplication)

748 */

749 skew_mcvs = (skew_mcvs * SKEW_HASH_MEM_PERCENT) / 100;

750

751 /* Now clamp to integer range */

752 skew_mcvs = Min(skew_mcvs, INT_MAX);

753

754 *num_skew_mcvs = (int) skew_mcvs;

755

756 /* Reduce hash_table_bytes by the amount needed for the skew table */

757 if (skew_mcvs > 0)

758 hash_table_bytes -= skew_mcvs * bytes_per_mcv;

759 }

760 else

761 *num_skew_mcvs = 0;

762

763 /*

764 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when

765 * memory is filled, assuming a single batch; but limit the value so that

766 * the pointer arrays we'll try to allocate do not exceed hash_table_bytes

767 * nor MaxAllocSize.

768 *

769 * Note that both nbuckets and nbatch must be powers of 2 to make

770 * ExecHashGetBucketAndBatch fast.

771 */

772 max_pointers = hash_table_bytes / sizeof(HashJoinTuple);

773 max_pointers = Min(max_pointers, MaxAllocSize / sizeof(HashJoinTuple));

774 /* If max_pointers isn't a power of 2, must round it down to one */

775 max_pointers = pg_prevpower2_size_t(max_pointers);

776

777 /* Also ensure we avoid integer overflow in nbatch and nbuckets */

778 /* (this step is redundant given the current value of MaxAllocSize) */

779 max_pointers = Min(max_pointers, INT_MAX / 2 + 1);

780

781 dbuckets = ceil(ntuples / NTUP_PER_BUCKET);

782 dbuckets = Min(dbuckets, max_pointers);

783 nbuckets = (int) dbuckets;

784 /* don't let nbuckets be really small, though ... */

785 nbuckets = Max(nbuckets, 1024);

786 /* ... and force it to be a power of 2. */

787 nbuckets = pg_nextpower2_32(nbuckets);

788

789 /*

790 * If there's not enough space to store the projected number of tuples and

791 * the required bucket headers, we will need multiple batches.

792 */

793 bucket_bytes = sizeof(HashJoinTuple) * nbuckets;

794 if (inner_rel_bytes + bucket_bytes > hash_table_bytes)

795 {

796 /* We'll need multiple batches */

797 size_t sbuckets;

798 double dbatch;

799 int minbatch;

800 size_t bucket_size;

801

802 /*

803 * If Parallel Hash with combined hash_mem would still need multiple

804 * batches, we'll have to fall back to regular hash_mem budget.

805 */

806 if (try_combined_hash_mem)

807 {

808 ExecChooseHashTableSize(ntuples, tupwidth, useskew,

809 false, parallel_workers,

810 space_allowed,

811 numbuckets,

812 numbatches,

813 num_skew_mcvs);

814 return;

815 }

816

817 /*

818 * Estimate the number of buckets we'll want to have when hash_mem is

819 * entirely full. Each bucket will contain a bucket pointer plus

820 * NTUP_PER_BUCKET tuples, whose projected size already includes

821 * overhead for the hash code, pointer to the next tuple, etc.

822 */

823 bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));

824 if (hash_table_bytes <= bucket_size)

825 sbuckets = 1; /* avoid pg_nextpower2_size_t(0) */

826 else

827 sbuckets = pg_nextpower2_size_t(hash_table_bytes / bucket_size);

828 sbuckets = Min(sbuckets, max_pointers);

829 nbuckets = (int) sbuckets;

830 nbuckets = pg_nextpower2_32(nbuckets);

831 bucket_bytes = nbuckets * sizeof(HashJoinTuple);

832

833 /*

834 * Buckets are simple pointers to hashjoin tuples, while tupsize

835 * includes the pointer, hash code, and MinimalTupleData. So buckets

836 * should never really exceed 25% of hash_mem (even for

837 * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not

838 * 2^N bytes, where we might get more because of doubling. So let's

839 * look for 50% here.

840 */

841 Assert(bucket_bytes <= hash_table_bytes / 2);

842

843 /* Calculate required number of batches. */

844 dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));

845 dbatch = Min(dbatch, max_pointers);

846 minbatch = (int) dbatch;

847 nbatch = pg_nextpower2_32(Max(2, minbatch));

848 }

849

850 /*

851 * Optimize the total amount of memory consumed by the hash node.

852 *

853 * The nbatch calculation above focuses on the size of the in-memory hash

854 * table, assuming no per-batch overhead. Now adjust the number of batches

855 * and the size of the hash table to minimize total memory consumed by the

856 * hash node.

857 *

858 * Each batch file has a BLCKSZ buffer, and we may need two files per

859 * batch (inner and outer side). So with enough batches this can be

860 * significantly more memory than the hashtable itself.

861 *

862 * The total memory usage may be expressed by this formula:

863 *

864 * (inner_rel_bytes / nbatch) + (2 * nbatch * BLCKSZ) <= hash_table_bytes

865 *

866 * where (inner_rel_bytes / nbatch) is the size of the in-memory hash

867 * table and (2 * nbatch * BLCKSZ) is the amount of memory used by file

868 * buffers. But for sufficiently large values of inner_rel_bytes value

869 * there may not be a nbatch value that would make both parts fit into

870 * hash_table_bytes.

871 *

872 * In this case we can't enforce the memory limit - we're going to exceed

873 * it. We can however minimize the impact and use as little memory as

874 * possible. (We haven't really enforced it before either, as we simply

875 * ignored the batch files.)

876 *

877 * The formula for total memory usage says that given an inner relation of

878 * size inner_rel_bytes, we may divide it into an arbitrary number of

879 * batches. This determines both the size of the in-memory hash table and

880 * the amount of memory needed for batch files. These two terms work in

881 * opposite ways - when one decreases, the other increases.

882 *

883 * For low nbatch values, the hash table takes most of the memory, but at

884 * some point the batch files start to dominate. If you combine these two

885 * terms, the memory consumption (for a fixed size of the inner relation)

886 * has a u-shape, with a minimum at some nbatch value.

887 *

888 * Our goal is to find this nbatch value, minimizing the memory usage. We

889 * calculate the memory usage with half the batches (i.e. nbatch/2), and

890 * if it's lower than the current memory usage we know it's better to use

891 * fewer batches. We repeat this until reducing the number of batches does

892 * not reduce the memory usage - we found the optimum. We know the optimum

893 * exists, thanks to the u-shape.

894 *

895 * We only want to do this when exceeding the memory limit, not every

896 * time. The goal is not to minimize memory usage in every case, but to

897 * minimize the memory usage when we can't stay within the memory limit.

898 *

899 * For this reason we only consider reducing the number of batches. We

900 * could try the opposite direction too, but that would save memory only

901 * when most of the memory is used by the hash table. And the hash table

902 * was used for the initial sizing, so we shouldn't be exceeding the

903 * memory limit too much. We might save memory by using more batches, but

904 * it would result in spilling more batch files, which does not seem like

905 * a great trade off.

906 *

907 * While growing the hashtable, we also adjust the number of buckets, to

908 * not have more than one tuple per bucket (load factor 1). We can only do

909 * this during the initial sizing - once we start building the hash,

910 * nbucket is fixed.

911 */

912 while (nbatch > 0)

913 {

914 /* how much memory are we using with current nbatch value */

915 size_t current_space = hash_table_bytes + (2 * nbatch * BLCKSZ);

916

917 /* how much memory would we use with half the batches */

918 size_t new_space = hash_table_bytes * 2 + (nbatch * BLCKSZ);

919

920 /* If the memory usage would not decrease, we found the optimum. */

921 if (current_space < new_space)

922 break;

923

924 /*

925 * It's better to use half the batches, so do that and adjust the

926 * nbucket in the opposite direction, and double the allowance.

927 */

928 nbatch /= 2;

929 nbuckets *= 2;

930

931 *space_allowed = (*space_allowed) * 2;

932 }

933

934 Assert(nbuckets > 0);

935 Assert(nbatch > 0);

936

937 *numbuckets = nbuckets;

938 *numbatches = nbatch;

939}

Min

#define Min(x, y)

Definition: c.h:1003

MAXALIGN

#define MAXALIGN(LEN)

Definition: c.h:810

Max

#define Max(x, y)

Definition: c.h:997

HashJoinTuple

struct HashJoinTupleData * HashJoinTuple

Definition: execnodes.h:2250

MaxAllocSize

#define MaxAllocSize

Definition: fe_memutils.h:22

Assert

Assert(PointerIsAligned(start, uint64))

HJTUPLE_OVERHEAD

#define HJTUPLE_OVERHEAD

Definition: hashjoin.h:90

SKEW_BUCKET_OVERHEAD

#define SKEW_BUCKET_OVERHEAD

Definition: hashjoin.h:119

SKEW_HASH_MEM_PERCENT

#define SKEW_HASH_MEM_PERCENT

Definition: hashjoin.h:121

SizeofMinimalTupleHeader

#define SizeofMinimalTupleHeader

Definition: htup_details.h:699

ExecChooseHashTableSize

void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, bool try_combined_hash_mem, int parallel_workers, size_t *space_allowed, int *numbuckets, int *numbatches, int *num_skew_mcvs)

Definition: nodeHash.c:657

NTUP_PER_BUCKET

#define NTUP_PER_BUCKET

Definition: nodeHash.c:654

get_hash_memory_limit

size_t get_hash_memory_limit(void)

Definition: nodeHash.c:3615

pg_nextpower2_32

static uint32 pg_nextpower2_32(uint32 num)

Definition: pg_bitutils.h:189

pg_nextpower2_size_t

#define pg_nextpower2_size_t

Definition: pg_bitutils.h:441

pg_prevpower2_size_t

#define pg_prevpower2_size_t

Definition: pg_bitutils.h:442

HashJoinTupleData

Definition: hashjoin.h:79

HashSkewBucket

Definition: hashjoin.h:114

References Assert(), ExecChooseHashTableSize(), get_hash_memory_limit(), HJTUPLE_OVERHEAD, Max, MAXALIGN, MaxAllocSize, Min, NTUP_PER_BUCKET, pg_nextpower2_32(), pg_nextpower2_size_t, pg_prevpower2_size_t, SizeofMinimalTupleHeader, SKEW_BUCKET_OVERHEAD, and SKEW_HASH_MEM_PERCENT.

Referenced by ExecChooseHashTableSize(), ExecHashTableCreate(), and initial_cost_hashjoin().

◆ ExecEndHash()

void ExecEndHash ( HashState * node )

Definition at line 426 of file nodeHash.c.

427{

428 PlanState *outerPlan;

429

430 /*

431 * shut down the subplan

432 */

433 outerPlan = outerPlanState(node);

434 ExecEndNode(outerPlan);

435}

ExecEndNode

void ExecEndNode(PlanState *node)

Definition: execProcnode.c:562

outerPlanState

#define outerPlanState(node)

Definition: execnodes.h:1255

outerPlan

#define outerPlan(node)

Definition: plannodes.h:252

PlanState

Definition: execnodes.h:1154

References ExecEndNode(), outerPlan, and outerPlanState.

Referenced by ExecEndNode().

◆ ExecHashAccumInstrumentation()

void ExecHashAccumInstrumentation ( HashInstrumentation * instrument,

HashJoinTable hashtable

)

Definition at line 2870 of file nodeHash.c.

2872{

2873 instrument->nbuckets = Max(instrument->nbuckets,

2874 hashtable->nbuckets);

2875 instrument->nbuckets_original = Max(instrument->nbuckets_original,

2876 hashtable->nbuckets_original);

2877 instrument->nbatch = Max(instrument->nbatch,

2878 hashtable->nbatch);

2879 instrument->nbatch_original = Max(instrument->nbatch_original,

2880 hashtable->nbatch_original);

2881 instrument->space_peak = Max(instrument->space_peak,

2882 hashtable->spacePeak);

2883}

HashInstrumentation::nbatch

int nbatch

Definition: execnodes.h:2790

HashInstrumentation::nbatch_original

int nbatch_original

Definition: execnodes.h:2791

HashInstrumentation::nbuckets

int nbuckets

Definition: execnodes.h:2788

HashInstrumentation::nbuckets_original

int nbuckets_original

Definition: execnodes.h:2789

HashInstrumentation::space_peak

Size space_peak

Definition: execnodes.h:2792

HashJoinTableData::nbuckets

int nbuckets

Definition: hashjoin.h:300

HashJoinTableData::nbatch

int nbatch

Definition: hashjoin.h:322

HashJoinTableData::nbuckets_original

int nbuckets_original

Definition: hashjoin.h:303

HashJoinTableData::spacePeak

Size spacePeak

Definition: hashjoin.h:346

HashJoinTableData::nbatch_original

int nbatch_original

Definition: hashjoin.h:325

References Max, HashJoinTableData::nbatch, HashInstrumentation::nbatch, HashJoinTableData::nbatch_original, HashInstrumentation::nbatch_original, HashJoinTableData::nbuckets, HashInstrumentation::nbuckets, HashJoinTableData::nbuckets_original, HashInstrumentation::nbuckets_original, HashInstrumentation::space_peak, and HashJoinTableData::spacePeak.

Referenced by ExecReScanHashJoin(), and ExecShutdownHash().

◆ ExecHashEstimate()

void ExecHashEstimate ( HashState * node,

ParallelContext * pcxt

)

Definition at line 2754 of file nodeHash.c.

2755{

2756 size_t size;

2757

2758 /* don't need this if not instrumenting or no workers */

2759 if (!node->ps.instrument || pcxt->nworkers == 0)

2760 return;

2761

2762 size = mul_size(pcxt->nworkers, sizeof(HashInstrumentation));

2763 size = add_size(size, offsetof(SharedHashInfo, hinstrument));

2764 shm_toc_estimate_chunk(&pcxt->estimator, size);

2765 shm_toc_estimate_keys(&pcxt->estimator, 1);

2766}

shm_toc_estimate_chunk

#define shm_toc_estimate_chunk(e, sz)

Definition: shm_toc.h:51

shm_toc_estimate_keys

#define shm_toc_estimate_keys(e, cnt)

Definition: shm_toc.h:53

add_size

Size add_size(Size s1, Size s2)

Definition: shmem.c:493

mul_size

Size mul_size(Size s1, Size s2)

Definition: shmem.c:510

HashInstrumentation

Definition: execnodes.h:2787

HashState::ps

PlanState ps

Definition: execnodes.h:2811

ParallelContext::estimator

shm_toc_estimator estimator

Definition: parallel.h:41

ParallelContext::nworkers

int nworkers

Definition: parallel.h:35

PlanState::instrument

Instrumentation * instrument

Definition: execnodes.h:1169

SharedHashInfo

Definition: execnodes.h:2800

References add_size(), ParallelContext::estimator, PlanState::instrument, mul_size(), ParallelContext::nworkers, HashState::ps, shm_toc_estimate_chunk, and shm_toc_estimate_keys.

Referenced by ExecParallelEstimate().

◆ ExecHashGetBucketAndBatch()

void ExecHashGetBucketAndBatch ( HashJoinTable hashtable,

uint32 hashvalue,

int * bucketno,

int * batchno

)

Definition at line 1953 of file nodeHash.c.

1957{

1958 uint32 nbuckets = (uint32) hashtable->nbuckets;

1959 uint32 nbatch = (uint32) hashtable->nbatch;

1960

1961 if (nbatch > 1)

1962 {

1963 *bucketno = hashvalue & (nbuckets - 1);

1964 *batchno = pg_rotate_right32(hashvalue,

1965 hashtable->log2_nbuckets) & (nbatch - 1);

1966 }

1967 else

1968 {

1969 *bucketno = hashvalue & (nbuckets - 1);

1970 *batchno = 0;

1971 }

1972}

uint32

uint32_t uint32

Definition: c.h:538

pg_rotate_right32

static uint32 pg_rotate_right32(uint32 word, int n)

Definition: pg_bitutils.h:422

HashJoinTableData::log2_nbuckets

int log2_nbuckets

Definition: hashjoin.h:301

References HashJoinTableData::log2_nbuckets, HashJoinTableData::nbatch, HashJoinTableData::nbuckets, and pg_rotate_right32().

Referenced by ExecHashIncreaseNumBatches(), ExecHashIncreaseNumBuckets(), ExecHashJoinImpl(), ExecHashRemoveNextSkewBucket(), ExecHashTableInsert(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinPartitionOuter(), ExecParallelHashRepartitionFirst(), ExecParallelHashRepartitionRest(), ExecParallelHashTableInsert(), and ExecParallelHashTableInsertCurrentBatch().

◆ ExecHashGetSkewBucket()

int ExecHashGetSkewBucket ( HashJoinTable hashtable,

uint32 hashvalue

)

Definition at line 2548 of file nodeHash.c.

2549{

2550 int bucket;

2551

2552 /*

2553 * Always return INVALID_SKEW_BUCKET_NO if not doing skew optimization (in

2554 * particular, this happens after the initial batch is done).

2555 */

2556 if (!hashtable->skewEnabled)

2557 return INVALID_SKEW_BUCKET_NO;

2558

2559 /*

2560 * Since skewBucketLen is a power of 2, we can do a modulo by ANDing.

2561 */

2562 bucket = hashvalue & (hashtable->skewBucketLen - 1);

2563

2564 /*

2565 * While we have not hit a hole in the hashtable and have not hit the

2566 * desired bucket, we have collided with some other hash value, so try the

2567 * next bucket location.

2568 */

2569 while (hashtable->skewBucket[bucket] != NULL &&

2570 hashtable->skewBucket[bucket]->hashvalue != hashvalue)

2571 bucket = (bucket + 1) & (hashtable->skewBucketLen - 1);

2572

2573 /*

2574 * Found the desired bucket?

2575 */

2576 if (hashtable->skewBucket[bucket] != NULL)

2577 return bucket;

2578

2579 /*

2580 * There must not be any hashtable entry for this hash value.

2581 */

2582 return INVALID_SKEW_BUCKET_NO;

2583}

INVALID_SKEW_BUCKET_NO

#define INVALID_SKEW_BUCKET_NO

Definition: hashjoin.h:120

HashJoinTableData::skewEnabled

bool skewEnabled

Definition: hashjoin.h:316

HashJoinTableData::skewBucketLen

int skewBucketLen

Definition: hashjoin.h:318

HashJoinTableData::skewBucket

HashSkewBucket ** skewBucket

Definition: hashjoin.h:317

HashSkewBucket::hashvalue

uint32 hashvalue

Definition: hashjoin.h:115

References HashSkewBucket::hashvalue, INVALID_SKEW_BUCKET_NO, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketLen, and HashJoinTableData::skewEnabled.

Referenced by ExecHashJoinImpl(), and MultiExecPrivateHash().

◆ ExecHashInitializeDSM()

void ExecHashInitializeDSM ( HashState * node,

ParallelContext * pcxt

)

Definition at line 2773 of file nodeHash.c.

2774{

2775 size_t size;

2776

2777 /* don't need this if not instrumenting or no workers */

2778 if (!node->ps.instrument || pcxt->nworkers == 0)

2779 return;

2780

2781 size = offsetof(SharedHashInfo, hinstrument) +

2782 pcxt->nworkers * sizeof(HashInstrumentation);

2783 node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size);

2784

2785 /* Each per-worker area must start out as zeroes. */

2786 memset(node->shared_info, 0, size);

2787

2788 node->shared_info->num_workers = pcxt->nworkers;

2789 shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id,

2790 node->shared_info);

2791}

HashInstrumentation

struct HashInstrumentation HashInstrumentation

shm_toc_allocate

void * shm_toc_allocate(shm_toc *toc, Size nbytes)

Definition: shm_toc.c:88

shm_toc_insert

void shm_toc_insert(shm_toc *toc, uint64 key, void *address)

Definition: shm_toc.c:171

HashState::shared_info

SharedHashInfo * shared_info

Definition: execnodes.h:2824

ParallelContext::toc

shm_toc * toc

Definition: parallel.h:44

PlanState::plan

Plan * plan

Definition: execnodes.h:1159

Plan::plan_node_id

int plan_node_id

Definition: plannodes.h:218

SharedHashInfo::num_workers

int num_workers

Definition: execnodes.h:2801

References PlanState::instrument, SharedHashInfo::num_workers, ParallelContext::nworkers, PlanState::plan, Plan::plan_node_id, HashState::ps, HashState::shared_info, shm_toc_allocate(), shm_toc_insert(), and ParallelContext::toc.

Referenced by ExecParallelInitializeDSM().

◆ ExecHashInitializeWorker()

void ExecHashInitializeWorker ( HashState * node,

ParallelWorkerContext * pwcxt

)

Definition at line 2798 of file nodeHash.c.

2799{

2800 SharedHashInfo *shared_info;

2801

2802 /* don't need this if not instrumenting */

2803 if (!node->ps.instrument)

2804 return;

2805

2806 /*

2807 * Find our entry in the shared area, and set up a pointer to it so that

2808 * we'll accumulate stats there when shutting down or rebuilding the hash

2809 * table.

2810 */

2811 shared_info = (SharedHashInfo *)

2812 shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);

2813 node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber];

2814}

ParallelWorkerNumber

int ParallelWorkerNumber

Definition: parallel.c:115

shm_toc_lookup

void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)

Definition: shm_toc.c:232

HashState::hinstrument

HashInstrumentation * hinstrument

Definition: execnodes.h:2831

ParallelWorkerContext::toc

shm_toc * toc

Definition: parallel.h:53

SharedHashInfo::hinstrument

HashInstrumentation hinstrument[FLEXIBLE_ARRAY_MEMBER]

Definition: execnodes.h:2802

References SharedHashInfo::hinstrument, HashState::hinstrument, PlanState::instrument, ParallelWorkerNumber, PlanState::plan, Plan::plan_node_id, HashState::ps, shm_toc_lookup(), and ParallelWorkerContext::toc.

Referenced by ExecParallelInitializeWorker().

◆ ExecHashRetrieveInstrumentation()

void ExecHashRetrieveInstrumentation ( HashState * node )

Definition at line 2839 of file nodeHash.c.

2840{

2841 SharedHashInfo *shared_info = node->shared_info;

2842 size_t size;

2843

2844 if (shared_info == NULL)

2845 return;

2846

2847 /* Replace node->shared_info with a copy in backend-local memory. */

2848 size = offsetof(SharedHashInfo, hinstrument) +

2849 shared_info->num_workers * sizeof(HashInstrumentation);

2850 node->shared_info = palloc(size);

2851 memcpy(node->shared_info, shared_info, size);

2852}

palloc

void * palloc(Size size)

Definition: mcxt.c:1365

References SharedHashInfo::num_workers, palloc(), and HashState::shared_info.

Referenced by ExecParallelRetrieveInstrumentation().

◆ ExecHashTableCreate()

HashJoinTable ExecHashTableCreate ( HashState * state )

Definition at line 445 of file nodeHash.c.

446{

447 Hash *node;

448 HashJoinTable hashtable;

449 Plan *outerNode;

450 size_t space_allowed;

451 int nbuckets;

452 int nbatch;

453 double rows;

454 int num_skew_mcvs;

455 int log2_nbuckets;

456 MemoryContext oldcxt;

457

458 /*

459 * Get information about the size of the relation to be hashed (it's the

460 * "outer" subtree of this node, but the inner relation of the hashjoin).

461 * Compute the appropriate size of the hash table.

462 */

463 node = (Hash *) state->ps.plan;

464 outerNode = outerPlan(node);

465

466 /*

467 * If this is shared hash table with a partial plan, then we can't use

468 * outerNode->plan_rows to estimate its size. We need an estimate of the

469 * total number of rows across all copies of the partial plan.

470 */

471 rows = node->plan.parallel_aware ? node->rows_total : outerNode->plan_rows;

472

473 ExecChooseHashTableSize(rows, outerNode->plan_width,

474 OidIsValid(node->skewTable),

475 state->parallel_state != NULL,

476 state->parallel_state != NULL ?

477 state->parallel_state->nparticipants - 1 : 0,

478 &space_allowed,

479 &nbuckets, &nbatch, &num_skew_mcvs);

480

481 /* nbuckets must be a power of 2 */

482 log2_nbuckets = pg_ceil_log2_32(nbuckets);

483 Assert(nbuckets == (1 << log2_nbuckets));

484

485 /*

486 * Initialize the hash table control block.

487 *

488 * The hashtable control block is just palloc'd from the executor's

489 * per-query memory context. Everything else should be kept inside the

490 * subsidiary hashCxt, batchCxt or spillCxt.

491 */

492 hashtable = palloc_object(HashJoinTableData);

493 hashtable->nbuckets = nbuckets;

494 hashtable->nbuckets_original = nbuckets;

495 hashtable->nbuckets_optimal = nbuckets;

496 hashtable->log2_nbuckets = log2_nbuckets;

497 hashtable->log2_nbuckets_optimal = log2_nbuckets;

498 hashtable->buckets.unshared = NULL;

499 hashtable->skewEnabled = false;

500 hashtable->skewBucket = NULL;

501 hashtable->skewBucketLen = 0;

502 hashtable->nSkewBuckets = 0;

503 hashtable->skewBucketNums = NULL;

504 hashtable->nbatch = nbatch;

505 hashtable->curbatch = 0;

506 hashtable->nbatch_original = nbatch;

507 hashtable->nbatch_outstart = nbatch;

508 hashtable->growEnabled = true;

509 hashtable->totalTuples = 0;

510 hashtable->partialTuples = 0;

511 hashtable->skewTuples = 0;

512 hashtable->innerBatchFile = NULL;

513 hashtable->outerBatchFile = NULL;

514 hashtable->spaceUsed = 0;

515 hashtable->spacePeak = 0;

516 hashtable->spaceAllowed = space_allowed;

517 hashtable->spaceUsedSkew = 0;

518 hashtable->spaceAllowedSkew =

519 hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;

520 hashtable->chunks = NULL;

521 hashtable->current_chunk = NULL;

522 hashtable->parallel_state = state->parallel_state;

523 hashtable->area = state->ps.state->es_query_dsa;

524 hashtable->batches = NULL;

525

526#ifdef HJDEBUG

527 printf("Hashjoin %p: initial nbatch = %d, nbuckets = %d\n",

528 hashtable, nbatch, nbuckets);

529#endif

530

531 /*

532 * Create temporary memory contexts in which to keep the hashtable working

533 * storage. See notes in executor/hashjoin.h.

534 */

535 hashtable->hashCxt = AllocSetContextCreate(CurrentMemoryContext,

536 "HashTableContext",

537 ALLOCSET_DEFAULT_SIZES);

538

539 hashtable->batchCxt = AllocSetContextCreate(hashtable->hashCxt,

540 "HashBatchContext",

541 ALLOCSET_DEFAULT_SIZES);

542

543 hashtable->spillCxt = AllocSetContextCreate(hashtable->hashCxt,

544 "HashSpillContext",

545 ALLOCSET_DEFAULT_SIZES);

546

547 /* Allocate data that will live for the life of the hashjoin */

548

549 oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);

550

551 if (nbatch > 1 && hashtable->parallel_state == NULL)

552 {

553 MemoryContext oldctx;

554

555 /*

556 * allocate and initialize the file arrays in hashCxt (not needed for

557 * parallel case which uses shared tuplestores instead of raw files)

558 */

559 oldctx = MemoryContextSwitchTo(hashtable->spillCxt);

560

561 hashtable->innerBatchFile = palloc0_array(BufFile *, nbatch);

562 hashtable->outerBatchFile = palloc0_array(BufFile *, nbatch);

563

564 MemoryContextSwitchTo(oldctx);

565

566 /* The files will not be opened until needed... */

567 /* ... but make sure we have temp tablespaces established for them */

568 PrepareTempTablespaces();

569 }

570

571 MemoryContextSwitchTo(oldcxt);

572

573 if (hashtable->parallel_state)

574 {

575 ParallelHashJoinState *pstate = hashtable->parallel_state;

576 Barrier *build_barrier;

577

578 /*

579 * Attach to the build barrier. The corresponding detach operation is

580 * in ExecHashTableDetach. Note that we won't attach to the

581 * batch_barrier for batch 0 yet. We'll attach later and start it out

582 * in PHJ_BATCH_PROBE phase, because batch 0 is allocated up front and

583 * then loaded while hashing (the standard hybrid hash join

584 * algorithm), and we'll coordinate that using build_barrier.

585 */

586 build_barrier = &pstate->build_barrier;

587 BarrierAttach(build_barrier);

588

589 /*

590 * So far we have no idea whether there are any other participants,

591 * and if so, what phase they are working on. The only thing we care

592 * about at this point is whether someone has already created the

593 * SharedHashJoinBatch objects and the hash table for batch 0. One

594 * backend will be elected to do that now if necessary.

595 */

596 if (BarrierPhase(build_barrier) == PHJ_BUILD_ELECT &&

597 BarrierArriveAndWait(build_barrier, WAIT_EVENT_HASH_BUILD_ELECT))

598 {

599 pstate->nbatch = nbatch;

600 pstate->space_allowed = space_allowed;

601 pstate->growth = PHJ_GROWTH_OK;

602

603 /* Set up the shared state for coordinating batches. */

604 ExecParallelHashJoinSetUpBatches(hashtable, nbatch);

605

606 /*

607 * Allocate batch 0's hash table up front so we can load it

608 * directly while hashing.

609 */

610 pstate->nbuckets = nbuckets;

611 ExecParallelHashTableAlloc(hashtable, 0);

612 }

613

614 /*

615 * The next Parallel Hash synchronization point is in

616 * MultiExecParallelHash(), which will progress it all the way to

617 * PHJ_BUILD_RUN. The caller must not return control from this

618 * executor node between now and then.

619 */

620 }

621 else

622 {

623 /*

624 * Prepare context for the first-scan space allocations; allocate the

625 * hashbucket array therein, and set each bucket "empty".

626 */

627 MemoryContextSwitchTo(hashtable->batchCxt);

628

629 hashtable->buckets.unshared = palloc0_array(HashJoinTuple, nbuckets);

630

631 /*

632 * Set up for skew optimization, if possible and there's a need for

633 * more than one batch. (In a one-batch join, there's no point in

634 * it.)

635 */

636 if (nbatch > 1)

637 ExecHashBuildSkewHash(state, hashtable, node, num_skew_mcvs);

638

639 MemoryContextSwitchTo(oldcxt);

640 }

641

642 return hashtable;

643}

PrepareTempTablespaces

void PrepareTempTablespaces(void)

Definition: tablespace.c:1331

BarrierAttach

int BarrierAttach(Barrier *barrier)

Definition: barrier.c:236

BarrierPhase

int BarrierPhase(Barrier *barrier)

Definition: barrier.c:265

BarrierArriveAndWait

bool BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info)

Definition: barrier.c:125

OidIsValid

#define OidIsValid(objectId)

Definition: c.h:774

palloc_object

#define palloc_object(type)

Definition: fe_memutils.h:74

palloc0_array

#define palloc0_array(type, count)

Definition: fe_memutils.h:77

PHJ_GROWTH_OK

@ PHJ_GROWTH_OK

Definition: hashjoin.h:233

PHJ_BUILD_ELECT

#define PHJ_BUILD_ELECT

Definition: hashjoin.h:269

CurrentMemoryContext

MemoryContext CurrentMemoryContext

Definition: mcxt.c:160

AllocSetContextCreate

#define AllocSetContextCreate

Definition: memutils.h:129

ALLOCSET_DEFAULT_SIZES

#define ALLOCSET_DEFAULT_SIZES

Definition: memutils.h:160

ExecHashBuildSkewHash

static void ExecHashBuildSkewHash(HashState *hashstate, HashJoinTable hashtable, Hash *node, int mcvsToUse)

Definition: nodeHash.c:2396

ExecParallelHashJoinSetUpBatches

static void ExecParallelHashJoinSetUpBatches(HashJoinTable hashtable, int nbatch)

Definition: nodeHash.c:3117

ExecParallelHashTableAlloc

void ExecParallelHashTableAlloc(HashJoinTable hashtable, int batchno)

Definition: nodeHash.c:3282

MemoryContextSwitchTo

static MemoryContext MemoryContextSwitchTo(MemoryContext context)

Definition: palloc.h:124

pg_ceil_log2_32

static uint32 pg_ceil_log2_32(uint32 num)

Definition: pg_bitutils.h:258

printf

#define printf(...)

Definition: port.h:245

Barrier

Definition: barrier.h:26

BufFile

Definition: buffile.c:71

HashJoinTableData

Definition: hashjoin.h:299

HashJoinTableData::unshared

struct HashJoinTupleData ** unshared

Definition: hashjoin.h:311

HashJoinTableData::chunks

HashMemoryChunk chunks

Definition: hashjoin.h:355

HashJoinTableData::spaceUsed

Size spaceUsed

Definition: hashjoin.h:344

HashJoinTableData::batches

ParallelHashJoinBatchAccessor * batches

Definition: hashjoin.h:361

HashJoinTableData::curbatch

int curbatch

Definition: hashjoin.h:323

HashJoinTableData::hashCxt

MemoryContext hashCxt

Definition: hashjoin.h:350

HashJoinTableData::buckets

union HashJoinTableData::@110 buckets

HashJoinTableData::totalTuples

double totalTuples

Definition: hashjoin.h:330

HashJoinTableData::partialTuples

double partialTuples

Definition: hashjoin.h:331

HashJoinTableData::parallel_state

ParallelHashJoinState * parallel_state

Definition: hashjoin.h:360

HashJoinTableData::spillCxt

MemoryContext spillCxt

Definition: hashjoin.h:352

HashJoinTableData::current_chunk

HashMemoryChunk current_chunk

Definition: hashjoin.h:358

HashJoinTableData::growEnabled

bool growEnabled

Definition: hashjoin.h:328

HashJoinTableData::nSkewBuckets

int nSkewBuckets

Definition: hashjoin.h:319

HashJoinTableData::nbuckets_optimal

int nbuckets_optimal

Definition: hashjoin.h:304

HashJoinTableData::spaceAllowedSkew

Size spaceAllowedSkew

Definition: hashjoin.h:348

HashJoinTableData::skewBucketNums

int * skewBucketNums

Definition: hashjoin.h:320

HashJoinTableData::innerBatchFile

BufFile ** innerBatchFile

Definition: hashjoin.h:341

HashJoinTableData::spaceUsedSkew

Size spaceUsedSkew

Definition: hashjoin.h:347

HashJoinTableData::spaceAllowed

Size spaceAllowed

Definition: hashjoin.h:345

HashJoinTableData::log2_nbuckets_optimal

int log2_nbuckets_optimal

Definition: hashjoin.h:305

HashJoinTableData::area

dsa_area * area

Definition: hashjoin.h:359

HashJoinTableData::nbatch_outstart

int nbatch_outstart

Definition: hashjoin.h:326

HashJoinTableData::outerBatchFile

BufFile ** outerBatchFile

Definition: hashjoin.h:342

HashJoinTableData::batchCxt

MemoryContext batchCxt

Definition: hashjoin.h:351

HashJoinTableData::skewTuples

double skewTuples

Definition: hashjoin.h:332

Hash

Definition: plannodes.h:1394

Hash::skewTable

Oid skewTable

Definition: plannodes.h:1404

Hash::rows_total

Cardinality rows_total

Definition: plannodes.h:1411

Hash::plan

Plan plan

Definition: plannodes.h:1395

MemoryContextData

Definition: memnodes.h:118

ParallelHashJoinState

Definition: hashjoin.h:247

ParallelHashJoinState::space_allowed

size_t space_allowed

Definition: hashjoin.h:256

ParallelHashJoinState::growth

ParallelHashGrowth growth

Definition: hashjoin.h:253

ParallelHashJoinState::nbuckets

int nbuckets

Definition: hashjoin.h:252

ParallelHashJoinState::build_barrier

Barrier build_barrier

Definition: hashjoin.h:260

ParallelHashJoinState::nbatch

int nbatch

Definition: hashjoin.h:250

Plan

Definition: plannodes.h:177

Plan::parallel_aware

bool parallel_aware

Definition: plannodes.h:204

Plan::plan_width

int plan_width

Definition: plannodes.h:198

Plan::plan_rows

Cardinality plan_rows

Definition: plannodes.h:196

state

Definition: regguts.h:323

Referenced by ExecHashJoinImpl().

◆ ExecHashTableDestroy()

void ExecHashTableDestroy ( HashJoinTable hashtable )

Definition at line 949 of file nodeHash.c.

950{

951 int i;

952

953 /*

954 * Make sure all the temp files are closed. We skip batch 0, since it

955 * can't have any temp files (and the arrays might not even exist if

956 * nbatch is only 1). Parallel hash joins don't use these files.

957 */

958 if (hashtable->innerBatchFile != NULL)

959 {

960 for (i = 1; i < hashtable->nbatch; i++)

961 {

962 if (hashtable->innerBatchFile[i])

963 BufFileClose(hashtable->innerBatchFile[i]);

964 if (hashtable->outerBatchFile[i])

965 BufFileClose(hashtable->outerBatchFile[i]);

966 }

967 }

968

969 /* Release working memory (batchCxt is a child, so it goes away too) */

970 MemoryContextDelete(hashtable->hashCxt);

971

972 /* And drop the control block */

973 pfree(hashtable);

974}

BufFileClose

void BufFileClose(BufFile *file)

Definition: buffile.c:412

i

int i

Definition: isn.c:77

pfree

void pfree(void *pointer)

Definition: mcxt.c:1594

MemoryContextDelete

void MemoryContextDelete(MemoryContext context)

Definition: mcxt.c:469

References BufFileClose(), HashJoinTableData::hashCxt, i, HashJoinTableData::innerBatchFile, MemoryContextDelete(), HashJoinTableData::nbatch, HashJoinTableData::outerBatchFile, and pfree().

Referenced by ExecEndHashJoin(), and ExecReScanHashJoin().

◆ ExecHashTableDetach()

void ExecHashTableDetach ( HashJoinTable hashtable )

Definition at line 3394 of file nodeHash.c.

3395{

3396 ParallelHashJoinState *pstate = hashtable->parallel_state;

3397

3398 /*

3399 * If we're involved in a parallel query, we must either have gotten all

3400 * the way to PHJ_BUILD_RUN, or joined too late and be in PHJ_BUILD_FREE.

3401 */

3402 Assert(!pstate ||

3403 BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUN);

3404

3405 if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUN)

3406 {

3407 int i;

3408

3409 /* Make sure any temporary files are closed. */

3410 if (hashtable->batches)

3411 {

3412 for (i = 0; i < hashtable->nbatch; ++i)

3413 {

3414 sts_end_write(hashtable->batches[i].inner_tuples);

3415 sts_end_write(hashtable->batches[i].outer_tuples);

3416 sts_end_parallel_scan(hashtable->batches[i].inner_tuples);

3417 sts_end_parallel_scan(hashtable->batches[i].outer_tuples);

3418 }

3419 }

3420

3421 /* If we're last to detach, clean up shared memory. */

3422 if (BarrierArriveAndDetach(&pstate->build_barrier))

3423 {

3424 /*

3425 * Late joining processes will see this state and give up

3426 * immediately.

3427 */

3428 Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_FREE);

3429

3430 if (DsaPointerIsValid(pstate->batches))

3431 {

3432 dsa_free(hashtable->area, pstate->batches);

3433 pstate->batches = InvalidDsaPointer;

3434 }

3435 }

3436 }

3437 hashtable->parallel_state = NULL;

3438}

BarrierArriveAndDetach

bool BarrierArriveAndDetach(Barrier *barrier)

Definition: barrier.c:203

dsa_free

void dsa_free(dsa_area *area, dsa_pointer dp)

Definition: dsa.c:841

InvalidDsaPointer

#define InvalidDsaPointer

Definition: dsa.h:78

DsaPointerIsValid

#define DsaPointerIsValid(x)

Definition: dsa.h:106

PHJ_BUILD_FREE

#define PHJ_BUILD_FREE

Definition: hashjoin.h:274

PHJ_BUILD_RUN

#define PHJ_BUILD_RUN

Definition: hashjoin.h:273

sts_end_write

void sts_end_write(SharedTuplestoreAccessor *accessor)

Definition: sharedtuplestore.c:213

sts_end_parallel_scan

void sts_end_parallel_scan(SharedTuplestoreAccessor *accessor)

Definition: sharedtuplestore.c:281

ParallelHashJoinBatchAccessor::outer_tuples

SharedTuplestoreAccessor * outer_tuples

Definition: hashjoin.h:221

ParallelHashJoinBatchAccessor::inner_tuples

SharedTuplestoreAccessor * inner_tuples

Definition: hashjoin.h:220

ParallelHashJoinState::batches

dsa_pointer batches

Definition: hashjoin.h:248

References HashJoinTableData::area, Assert(), BarrierArriveAndDetach(), BarrierPhase(), ParallelHashJoinState::batches, HashJoinTableData::batches, ParallelHashJoinState::build_barrier, dsa_free(), DsaPointerIsValid, i, ParallelHashJoinBatchAccessor::inner_tuples, InvalidDsaPointer, HashJoinTableData::nbatch, ParallelHashJoinBatchAccessor::outer_tuples, HashJoinTableData::parallel_state, PHJ_BUILD_FREE, PHJ_BUILD_RUN, sts_end_parallel_scan(), and sts_end_write().

Referenced by ExecHashJoinReInitializeDSM(), and ExecShutdownHashJoin().

◆ ExecHashTableDetachBatch()

void ExecHashTableDetachBatch ( HashJoinTable hashtable )

Definition at line 3302 of file nodeHash.c.

3303{

3304 if (hashtable->parallel_state != NULL &&

3305 hashtable->curbatch >= 0)

3306 {

3307 int curbatch = hashtable->curbatch;

3308 ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;

3309 bool attached = true;

3310

3311 /* Make sure any temporary files are closed. */

3312 sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);

3313 sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);

3314

3315 /* After attaching we always get at least to PHJ_BATCH_PROBE. */

3316 Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE ||

3317 BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);

3318

3319 /*

3320 * If we're abandoning the PHJ_BATCH_PROBE phase early without having

3321 * reached the end of it, it means the plan doesn't want any more

3322 * tuples, and it is happy to abandon any tuples buffered in this

3323 * process's subplans. For correctness, we can't allow any process to

3324 * execute the PHJ_BATCH_SCAN phase, because we will never have the

3325 * complete set of match bits. Therefore we skip emitting unmatched

3326 * tuples in all backends (if this is a full/right join), as if those

3327 * tuples were all due to be emitted by this process and it has

3328 * abandoned them too.

3329 */

3330 if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE &&

3331 !hashtable->batches[curbatch].outer_eof)

3332 {

3333 /*

3334 * This flag may be written to by multiple backends during

3335 * PHJ_BATCH_PROBE phase, but will only be read in PHJ_BATCH_SCAN

3336 * phase so requires no extra locking.

3337 */

3338 batch->skip_unmatched = true;

3339 }

3340

3341 /*

3342 * Even if we aren't doing a full/right outer join, we'll step through

3343 * the PHJ_BATCH_SCAN phase just to maintain the invariant that

3344 * freeing happens in PHJ_BATCH_FREE, but that'll be wait-free.

3345 */

3346 if (BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE)

3347 attached = BarrierArriveAndDetachExceptLast(&batch->batch_barrier);

3348 if (attached && BarrierArriveAndDetach(&batch->batch_barrier))

3349 {

3350 /*

3351 * We are no longer attached to the batch barrier, but we're the

3352 * process that was chosen to free resources and it's safe to

3353 * assert the current phase. The ParallelHashJoinBatch can't go

3354 * away underneath us while we are attached to the build barrier,

3355 * making this access safe.

3356 */

3357 Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_FREE);

3358

3359 /* Free shared chunks and buckets. */

3360 while (DsaPointerIsValid(batch->chunks))

3361 {

3362 HashMemoryChunk chunk =

3363 dsa_get_address(hashtable->area, batch->chunks);

3364 dsa_pointer next = chunk->next.shared;

3365

3366 dsa_free(hashtable->area, batch->chunks);

3367 batch->chunks = next;

3368 }

3369 if (DsaPointerIsValid(batch->buckets))

3370 {

3371 dsa_free(hashtable->area, batch->buckets);

3372 batch->buckets = InvalidDsaPointer;

3373 }

3374 }

3375

3376 /*

3377 * Track the largest batch we've been attached to. Though each

3378 * backend might see a different subset of batches, explain.c will

3379 * scan the results from all backends to find the largest value.

3380 */

3381 hashtable->spacePeak =

3382 Max(hashtable->spacePeak,

3383 batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);

3384

3385 /* Remember that we are not attached to a batch. */

3386 hashtable->curbatch = -1;

3387 }

3388}

BarrierArriveAndDetachExceptLast

bool BarrierArriveAndDetachExceptLast(Barrier *barrier)

Definition: barrier.c:213

Definition: blutils.c:224

dsa_get_address

void * dsa_get_address(dsa_area *area, dsa_pointer dp)

Definition: dsa.c:957

dsa_pointer

uint64 dsa_pointer

Definition: dsa.h:62

PHJ_BATCH_SCAN

#define PHJ_BATCH_SCAN

Definition: hashjoin.h:281

PHJ_BATCH_PROBE

#define PHJ_BATCH_PROBE

Definition: hashjoin.h:280

PHJ_BATCH_FREE

#define PHJ_BATCH_FREE

Definition: hashjoin.h:282

HashMemoryChunkData

Definition: hashjoin.h:129

HashMemoryChunkData::next

union HashMemoryChunkData::@109 next

HashMemoryChunkData::shared

dsa_pointer shared

Definition: hashjoin.h:138

ParallelHashJoinBatchAccessor::shared

ParallelHashJoinBatch * shared

Definition: hashjoin.h:209

ParallelHashJoinBatchAccessor::outer_eof

bool outer_eof

Definition: hashjoin.h:218

ParallelHashJoinBatch

Definition: hashjoin.h:163

ParallelHashJoinBatch::batch_barrier

Barrier batch_barrier

Definition: hashjoin.h:165

ParallelHashJoinBatch::chunks

dsa_pointer chunks

Definition: hashjoin.h:167

ParallelHashJoinBatch::skip_unmatched

bool skip_unmatched

Definition: hashjoin.h:173

ParallelHashJoinBatch::buckets

dsa_pointer buckets

Definition: hashjoin.h:164

ParallelHashJoinBatch::size

size_t size

Definition: hashjoin.h:168

pg_atomic_uint64

Definition: fallback.h:27

Referenced by ExecHashJoinReInitializeDSM(), ExecParallelHashJoinNewBatch(), ExecParallelPrepHashTableForUnmatched(), and ExecShutdownHashJoin().

◆ ExecHashTableInsert()

void ExecHashTableInsert ( HashJoinTable hashtable,

TupleTableSlot * slot,

uint32 hashvalue

)

Definition at line 1742 of file nodeHash.c.

1745{

1746 bool shouldFree;

1747 MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);

1748 int bucketno;

1749 int batchno;

1750

1751 ExecHashGetBucketAndBatch(hashtable, hashvalue,

1752 &bucketno, &batchno);

1753

1754 /*

1755 * decide whether to put the tuple in the hash table or a temp file

1756 */

1757 if (batchno == hashtable->curbatch)

1758 {

1759 /*

1760 * put the tuple in hash table

1761 */

1762 HashJoinTuple hashTuple;

1763 int hashTupleSize;

1764 double ntuples = (hashtable->totalTuples - hashtable->skewTuples);

1765

1766 /* Create the HashJoinTuple */

1767 hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;

1768 hashTuple = (HashJoinTuple) dense_alloc(hashtable, hashTupleSize);

1769

1770 hashTuple->hashvalue = hashvalue;

1771 memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);

1772

1773 /*

1774 * We always reset the tuple-matched flag on insertion. This is okay

1775 * even when reloading a tuple from a batch file, since the tuple

1776 * could not possibly have been matched to an outer tuple before it

1777 * went into the batch file.

1778 */

1779 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));

1780

1781 /* Push it onto the front of the bucket's list */

1782 hashTuple->next.unshared = hashtable->buckets.unshared[bucketno];

1783 hashtable->buckets.unshared[bucketno] = hashTuple;

1784

1785 /*

1786 * Increase the (optimal) number of buckets if we just exceeded the

1787 * NTUP_PER_BUCKET threshold, but only when there's still a single

1788 * batch.

1789 */

1790 if (hashtable->nbatch == 1 &&

1791 ntuples > (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))

1792 {

1793 /* Guard against integer overflow and alloc size overflow */

1794 if (hashtable->nbuckets_optimal <= INT_MAX / 2 &&

1795 hashtable->nbuckets_optimal * 2 <= MaxAllocSize / sizeof(HashJoinTuple))

1796 {

1797 hashtable->nbuckets_optimal *= 2;

1798 hashtable->log2_nbuckets_optimal += 1;

1799 }

1800 }

1801

1802 /* Account for space used, and back off if we've used too much */

1803 hashtable->spaceUsed += hashTupleSize;

1804 if (hashtable->spaceUsed > hashtable->spacePeak)

1805 hashtable->spacePeak = hashtable->spaceUsed;

1806 if (hashtable->spaceUsed +

1807 hashtable->nbuckets_optimal * sizeof(HashJoinTuple)

1808 > hashtable->spaceAllowed)

1809 ExecHashIncreaseNumBatches(hashtable);

1810 }

1811 else

1812 {

1813 /*

1814 * put the tuple into a temp file for later batches

1815 */

1816 Assert(batchno > hashtable->curbatch);

1817 ExecHashJoinSaveTuple(tuple,

1818 hashvalue,

1819 &hashtable->innerBatchFile[batchno],

1820 hashtable);

1821 }

1822

1823 if (shouldFree)

1824 heap_free_minimal_tuple(tuple);

1825}

ExecFetchSlotMinimalTuple

MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot, bool *shouldFree)

Definition: execTuples.c:1881

HJTUPLE_MINTUPLE

#define HJTUPLE_MINTUPLE(hjtup)

Definition: hashjoin.h:91

heap_free_minimal_tuple

void heap_free_minimal_tuple(MinimalTuple mtup)

Definition: heaptuple.c:1530

HeapTupleHeaderClearMatch

static void HeapTupleHeaderClearMatch(MinimalTupleData *tup)

Definition: htup_details.h:718

dense_alloc

static void * dense_alloc(HashJoinTable hashtable, Size size)

Definition: nodeHash.c:2889

ExecHashIncreaseNumBatches

static void ExecHashIncreaseNumBatches(HashJoinTable hashtable)

Definition: nodeHash.c:1023

ExecHashGetBucketAndBatch

void ExecHashGetBucketAndBatch(HashJoinTable hashtable, uint32 hashvalue, int *bucketno, int *batchno)

Definition: nodeHash.c:1953

ExecHashJoinSaveTuple

void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue, BufFile **fileptr, HashJoinTable hashtable)

Definition: nodeHashjoin.c:1414

HashJoinTupleData::next

union HashJoinTupleData::@108 next

HashJoinTupleData::hashvalue

uint32 hashvalue

Definition: hashjoin.h:86

HashJoinTupleData::unshared

struct HashJoinTupleData * unshared

Definition: hashjoin.h:83

MinimalTupleData

Definition: htup_details.h:677

MinimalTupleData::t_len

uint32 t_len

Definition: htup_details.h:678

Referenced by ExecHashJoinNewBatch(), and MultiExecPrivateHash().

◆ ExecHashTableReset()

void ExecHashTableReset ( HashJoinTable hashtable )

Definition at line 2320 of file nodeHash.c.

2321{

2322 MemoryContext oldcxt;

2323 int nbuckets = hashtable->nbuckets;

2324

2325 /*

2326 * Release all the hash buckets and tuples acquired in the prior pass, and

2327 * reinitialize the context for a new pass.

2328 */

2329 MemoryContextReset(hashtable->batchCxt);

2330 oldcxt = MemoryContextSwitchTo(hashtable->batchCxt);

2331

2332 /* Reallocate and reinitialize the hash bucket headers. */

2333 hashtable->buckets.unshared = palloc0_array(HashJoinTuple, nbuckets);

2334

2335 hashtable->spaceUsed = 0;

2336

2337 MemoryContextSwitchTo(oldcxt);

2338

2339 /* Forget the chunks (the memory was freed by the context reset above). */

2340 hashtable->chunks = NULL;

2341}

MemoryContextReset

void MemoryContextReset(MemoryContext context)

Definition: mcxt.c:400

References HashJoinTableData::batchCxt, HashJoinTableData::buckets, HashJoinTableData::chunks, MemoryContextReset(), MemoryContextSwitchTo(), HashJoinTableData::nbuckets, palloc0_array, HashJoinTableData::spaceUsed, and HashJoinTableData::unshared.

Referenced by ExecHashJoinNewBatch().

◆ ExecHashTableResetMatchFlags()

void ExecHashTableResetMatchFlags ( HashJoinTable hashtable )

Definition at line 2348 of file nodeHash.c.

2349{

2350 HashJoinTuple tuple;

2351 int i;

2352

2353 /* Reset all flags in the main table ... */

2354 for (i = 0; i < hashtable->nbuckets; i++)

2355 {

2356 for (tuple = hashtable->buckets.unshared[i]; tuple != NULL;

2357 tuple = tuple->next.unshared)

2358 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));

2359 }

2360

2361 /* ... and the same for the skew buckets, if any */

2362 for (i = 0; i < hashtable->nSkewBuckets; i++)

2363 {

2364 int j = hashtable->skewBucketNums[i];

2365 HashSkewBucket *skewBucket = hashtable->skewBucket[j];

2366

2367 for (tuple = skewBucket->tuples; tuple != NULL; tuple = tuple->next.unshared)

2368 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(tuple));

2369 }

2370}

j

int j

Definition: isn.c:78

HashSkewBucket::tuples

HashJoinTuple tuples

Definition: hashjoin.h:116

References HashJoinTableData::buckets, HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, i, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecReScanHashJoin().

◆ ExecInitHash()

HashState * ExecInitHash ( Hash * node,

EState * estate,

int eflags

)

Definition at line 369 of file nodeHash.c.

370{

371 HashState *hashstate;

372

373 /* check for unsupported flags */

374 Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));

375

376 /*

377 * create state structure

378 */

379 hashstate = makeNode(HashState);

380 hashstate->ps.plan = (Plan *) node;

381 hashstate->ps.state = estate;

382 hashstate->ps.ExecProcNode = ExecHash;

383 /* delay building hashtable until ExecHashTableCreate() in executor run */

384 hashstate->hashtable = NULL;

385

386 /*

387 * Miscellaneous initialization

388 *

389 * create expression context for node

390 */

391 ExecAssignExprContext(estate, &hashstate->ps);

392

393 /*

394 * initialize child nodes

395 */

396 outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags);

397

398 /*

399 * initialize our result slot and type. No need to build projection

400 * because this node doesn't do projections.

401 */

402 ExecInitResultTupleSlotTL(&hashstate->ps, &TTSOpsMinimalTuple);

403 hashstate->ps.ps_ProjInfo = NULL;

404

405 Assert(node->plan.qual == NIL);

406

407 /*

408 * Delay initialization of hash_expr until ExecInitHashJoin(). We cannot

409 * build the ExprState here as we don't yet know the join type we're going

410 * to be hashing values for and we need to know that before calling

411 * ExecBuildHash32Expr as the keep_nulls parameter depends on the join

412 * type.

413 */

414 hashstate->hash_expr = NULL;

415

416 return hashstate;

417}

ExecInitNode

PlanState * ExecInitNode(Plan *node, EState *estate, int eflags)

Definition: execProcnode.c:142

ExecInitResultTupleSlotTL

void ExecInitResultTupleSlotTL(PlanState *planstate, const TupleTableSlotOps *tts_ops)

Definition: execTuples.c:1988

TTSOpsMinimalTuple

const TupleTableSlotOps TTSOpsMinimalTuple

Definition: execTuples.c:86

ExecAssignExprContext

void ExecAssignExprContext(EState *estate, PlanState *planstate)

Definition: execUtils.c:485

EXEC_FLAG_BACKWARD

#define EXEC_FLAG_BACKWARD

Definition: executor.h:69

EXEC_FLAG_MARK

#define EXEC_FLAG_MARK

Definition: executor.h:70

ExecHash

static TupleTableSlot * ExecHash(PlanState *pstate)

Definition: nodeHash.c:90

makeNode

#define makeNode(_type_)

Definition: nodes.h:161

NIL

#define NIL

Definition: pg_list.h:68

HashState

Definition: execnodes.h:2810

HashState::hashtable

HashJoinTable hashtable

Definition: execnodes.h:2812

HashState::hash_expr

ExprState * hash_expr

Definition: execnodes.h:2813

PlanState::state

EState * state

Definition: execnodes.h:1161

PlanState::ps_ProjInfo

ProjectionInfo * ps_ProjInfo

Definition: execnodes.h:1199

PlanState::ExecProcNode

ExecProcNodeMtd ExecProcNode

Definition: execnodes.h:1165

Plan::qual

List * qual

Definition: plannodes.h:222

References Assert(), EXEC_FLAG_BACKWARD, EXEC_FLAG_MARK, ExecAssignExprContext(), ExecHash(), ExecInitNode(), ExecInitResultTupleSlotTL(), PlanState::ExecProcNode, HashState::hash_expr, HashState::hashtable, makeNode, NIL, outerPlan, outerPlanState, PlanState::plan, Hash::plan, HashState::ps, PlanState::ps_ProjInfo, Plan::qual, PlanState::state, and TTSOpsMinimalTuple.

Referenced by ExecInitNode().

◆ ExecParallelHashTableAlloc()

void ExecParallelHashTableAlloc ( HashJoinTable hashtable,

int batchno

)

Definition at line 3282 of file nodeHash.c.

3283{

3284 ParallelHashJoinBatch *batch = hashtable->batches[batchno].shared;

3285 dsa_pointer_atomic *buckets;

3286 int nbuckets = hashtable->parallel_state->nbuckets;

3287 int i;

3288

3289 batch->buckets =

3290 dsa_allocate(hashtable->area, sizeof(dsa_pointer_atomic) * nbuckets);

3291 buckets = (dsa_pointer_atomic *)

3292 dsa_get_address(hashtable->area, batch->buckets);

3293 for (i = 0; i < nbuckets; ++i)

3294 dsa_pointer_atomic_init(&buckets[i], InvalidDsaPointer);

3295}

dsa_pointer_atomic_init

#define dsa_pointer_atomic_init

Definition: dsa.h:64

dsa_allocate

#define dsa_allocate(area, size)

Definition: dsa.h:109

References HashJoinTableData::area, HashJoinTableData::batches, ParallelHashJoinBatch::buckets, dsa_allocate, dsa_get_address(), dsa_pointer_atomic_init, i, InvalidDsaPointer, ParallelHashJoinState::nbuckets, HashJoinTableData::parallel_state, and ParallelHashJoinBatchAccessor::shared.

Referenced by ExecHashTableCreate(), and ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableInsert()

void ExecParallelHashTableInsert ( HashJoinTable hashtable,

TupleTableSlot * slot,

uint32 hashvalue

)

Definition at line 1832 of file nodeHash.c.

1835{

1836 bool shouldFree;

1837 MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);

1838 dsa_pointer shared;

1839 int bucketno;

1840 int batchno;

1841

1842retry:

1843 ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

1844

1845 if (batchno == 0)

1846 {

1847 HashJoinTuple hashTuple;

1848

1849 /* Try to load it into memory. */

1850 Assert(BarrierPhase(&hashtable->parallel_state->build_barrier) ==

1851 PHJ_BUILD_HASH_INNER);

1852 hashTuple = ExecParallelHashTupleAlloc(hashtable,

1853 HJTUPLE_OVERHEAD + tuple->t_len,

1854 &shared);

1855 if (hashTuple == NULL)

1856 goto retry;

1857

1858 /* Store the hash value in the HashJoinTuple header. */

1859 hashTuple->hashvalue = hashvalue;

1860 memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);

1861 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));

1862

1863 /* Push it onto the front of the bucket's list */

1864 ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],

1865 hashTuple, shared);

1866 }

1867 else

1868 {

1869 size_t tuple_size = MAXALIGN(HJTUPLE_OVERHEAD + tuple->t_len);

1870

1871 Assert(batchno > 0);

1872

1873 /* Try to preallocate space in the batch if necessary. */

1874 if (hashtable->batches[batchno].preallocated < tuple_size)

1875 {

1876 if (!ExecParallelHashTuplePrealloc(hashtable, batchno, tuple_size))

1877 goto retry;

1878 }

1879

1880 Assert(hashtable->batches[batchno].preallocated >= tuple_size);

1881 hashtable->batches[batchno].preallocated -= tuple_size;

1882 sts_puttuple(hashtable->batches[batchno].inner_tuples, &hashvalue,

1883 tuple);

1884 }

1885 ++hashtable->batches[batchno].ntuples;

1886

1887 if (shouldFree)

1888 heap_free_minimal_tuple(tuple);

1889}

PHJ_BUILD_HASH_INNER

#define PHJ_BUILD_HASH_INNER

Definition: hashjoin.h:271

ExecParallelHashTuplePrealloc

static bool ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)

Definition: nodeHash.c:3554

ExecParallelHashTupleAlloc

static HashJoinTuple ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size, dsa_pointer *shared)

Definition: nodeHash.c:2969

ExecParallelHashPushTuple

static void ExecParallelHashPushTuple(dsa_pointer_atomic *head, HashJoinTuple tuple, dsa_pointer tuple_shared)

Definition: nodeHash.c:3474

sts_puttuple

void sts_puttuple(SharedTuplestoreAccessor *accessor, void *meta_data, MinimalTuple tuple)

Definition: sharedtuplestore.c:300

HashJoinTableData::shared

dsa_pointer_atomic * shared

Definition: hashjoin.h:313

ParallelHashJoinBatchAccessor::preallocated

size_t preallocated

Definition: hashjoin.h:212

ParallelHashJoinBatchAccessor::ntuples

size_t ntuples

Definition: hashjoin.h:213

References Assert(), BarrierPhase(), HashJoinTableData::batches, HashJoinTableData::buckets, ParallelHashJoinState::build_barrier, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), ExecParallelHashTuplePrealloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, ParallelHashJoinBatchAccessor::inner_tuples, MAXALIGN, ParallelHashJoinBatchAccessor::ntuples, HashJoinTableData::parallel_state, PHJ_BUILD_HASH_INNER, ParallelHashJoinBatchAccessor::preallocated, HashJoinTableData::shared, sts_puttuple(), and MinimalTupleData::t_len.

Referenced by MultiExecParallelHash().

◆ ExecParallelHashTableInsertCurrentBatch()

void ExecParallelHashTableInsertCurrentBatch ( HashJoinTable hashtable,

TupleTableSlot * slot,

uint32 hashvalue

)

Definition at line 1898 of file nodeHash.c.

1901{

1902 bool shouldFree;

1903 MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot, &shouldFree);

1904 HashJoinTuple hashTuple;

1905 dsa_pointer shared;

1906 int batchno;

1907 int bucketno;

1908

1909 ExecHashGetBucketAndBatch(hashtable, hashvalue, &bucketno, &batchno);

1910 Assert(batchno == hashtable->curbatch);

1911 hashTuple = ExecParallelHashTupleAlloc(hashtable,

1912 HJTUPLE_OVERHEAD + tuple->t_len,

1913 &shared);

1914 hashTuple->hashvalue = hashvalue;

1915 memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);

1916 HeapTupleHeaderClearMatch(HJTUPLE_MINTUPLE(hashTuple));

1917 ExecParallelHashPushTuple(&hashtable->buckets.shared[bucketno],

1918 hashTuple, shared);

1919

1920 if (shouldFree)

1921 heap_free_minimal_tuple(tuple);

1922}

References Assert(), HashJoinTableData::buckets, HashJoinTableData::curbatch, ExecFetchSlotMinimalTuple(), ExecHashGetBucketAndBatch(), ExecParallelHashPushTuple(), ExecParallelHashTupleAlloc(), HashJoinTupleData::hashvalue, heap_free_minimal_tuple(), HeapTupleHeaderClearMatch(), HJTUPLE_MINTUPLE, HJTUPLE_OVERHEAD, HashJoinTableData::shared, and MinimalTupleData::t_len.

Referenced by ExecParallelHashJoinNewBatch().

◆ ExecParallelHashTableSetCurrentBatch()

void ExecParallelHashTableSetCurrentBatch ( HashJoinTable hashtable,

int batchno

)

Definition at line 3492 of file nodeHash.c.

3493{

3494 Assert(hashtable->batches[batchno].shared->buckets != InvalidDsaPointer);

3495

3496 hashtable->curbatch = batchno;

3497 hashtable->buckets.shared = (dsa_pointer_atomic *)

3498 dsa_get_address(hashtable->area,

3499 hashtable->batches[batchno].shared->buckets);

3500 hashtable->nbuckets = hashtable->parallel_state->nbuckets;

3501 hashtable->log2_nbuckets = pg_ceil_log2_32(hashtable->nbuckets);

3502 hashtable->current_chunk = NULL;

3503 hashtable->current_chunk_shared = InvalidDsaPointer;

3504 hashtable->batches[batchno].at_least_one_chunk = false;

3505}

HashJoinTableData::current_chunk_shared

dsa_pointer current_chunk_shared

Definition: hashjoin.h:362

ParallelHashJoinBatchAccessor::at_least_one_chunk

bool at_least_one_chunk

Definition: hashjoin.h:217

Referenced by ExecParallelHashIncreaseNumBatches(), ExecParallelHashIncreaseNumBuckets(), ExecParallelHashJoinNewBatch(), and MultiExecParallelHash().

◆ ExecParallelPrepHashTableForUnmatched()

bool ExecParallelPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2118 of file nodeHash.c.

2119{

2120 HashJoinTable hashtable = hjstate->hj_HashTable;

2121 int curbatch = hashtable->curbatch;

2122 ParallelHashJoinBatch *batch = hashtable->batches[curbatch].shared;

2123

2124 Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_PROBE);

2125

2126 /*

2127 * It would not be deadlock-free to wait on the batch barrier, because it

2128 * is in PHJ_BATCH_PROBE phase, and thus processes attached to it have

2129 * already emitted tuples. Therefore, we'll hold a wait-free election:

2130 * only one process can continue to the next phase, and all others detach

2131 * from this batch. They can still do any work on other batches, if there

2132 * are any.

2133 */

2134 if (!BarrierArriveAndDetachExceptLast(&batch->batch_barrier))

2135 {

2136 /* This process considers the batch to be done. */

2137 hashtable->batches[hashtable->curbatch].done = true;

2138

2139 /* Make sure any temporary files are closed. */

2140 sts_end_parallel_scan(hashtable->batches[curbatch].inner_tuples);

2141 sts_end_parallel_scan(hashtable->batches[curbatch].outer_tuples);

2142

2143 /*

2144 * Track largest batch we've seen, which would normally happen in

2145 * ExecHashTableDetachBatch().

2146 */

2147 hashtable->spacePeak =

2148 Max(hashtable->spacePeak,

2149 batch->size + sizeof(dsa_pointer_atomic) * hashtable->nbuckets);

2150 hashtable->curbatch = -1;

2151 return false;

2152 }

2153

2154 /* Now we are alone with this batch. */

2155 Assert(BarrierPhase(&batch->batch_barrier) == PHJ_BATCH_SCAN);

2156

2157 /*

2158 * Has another process decided to give up early and command all processes

2159 * to skip the unmatched scan?

2160 */

2161 if (batch->skip_unmatched)

2162 {

2163 hashtable->batches[hashtable->curbatch].done = true;

2164 ExecHashTableDetachBatch(hashtable);

2165 return false;

2166 }

2167

2168 /* Now prepare the process local state, just as for non-parallel join. */

2169 ExecPrepHashTableForUnmatched(hjstate);

2170

2171 return true;

2172}

ExecHashTableDetachBatch

void ExecHashTableDetachBatch(HashJoinTable hashtable)

Definition: nodeHash.c:3302

ExecPrepHashTableForUnmatched

void ExecPrepHashTableForUnmatched(HashJoinState *hjstate)

Definition: nodeHash.c:2097

HashJoinState::hj_HashTable

HashJoinTable hj_HashTable

Definition: execnodes.h:2258

ParallelHashJoinBatchAccessor::done

bool done

Definition: hashjoin.h:219

References Assert(), BarrierArriveAndDetachExceptLast(), BarrierPhase(), ParallelHashJoinBatch::batch_barrier, HashJoinTableData::batches, HashJoinTableData::curbatch, ParallelHashJoinBatchAccessor::done, ExecHashTableDetachBatch(), ExecPrepHashTableForUnmatched(), HashJoinState::hj_HashTable, ParallelHashJoinBatchAccessor::inner_tuples, Max, HashJoinTableData::nbuckets, ParallelHashJoinBatchAccessor::outer_tuples, PHJ_BATCH_PROBE, PHJ_BATCH_SCAN, ParallelHashJoinBatchAccessor::shared, ParallelHashJoinBatch::size, ParallelHashJoinBatch::skip_unmatched, HashJoinTableData::spacePeak, and sts_end_parallel_scan().

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashBucket()

bool ExecParallelScanHashBucket ( HashJoinState * hjstate,

ExprContext * econtext

)

Definition at line 2046 of file nodeHash.c.

2048{

2049 ExprState *hjclauses = hjstate->hashclauses;

2050 HashJoinTable hashtable = hjstate->hj_HashTable;

2051 HashJoinTuple hashTuple = hjstate->hj_CurTuple;

2052 uint32 hashvalue = hjstate->hj_CurHashValue;

2053

2054 /*

2055 * hj_CurTuple is the address of the tuple last returned from the current

2056 * bucket, or NULL if it's time to start scanning a new bucket.

2057 */

2058 if (hashTuple != NULL)

2059 hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);

2060 else

2061 hashTuple = ExecParallelHashFirstTuple(hashtable,

2062 hjstate->hj_CurBucketNo);

2063

2064 while (hashTuple != NULL)

2065 {

2066 if (hashTuple->hashvalue == hashvalue)

2067 {

2068 TupleTableSlot *inntuple;

2069

2070 /* insert hashtable's tuple into exec slot so ExecQual sees it */

2071 inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),

2072 hjstate->hj_HashTupleSlot,

2073 false); /* do not pfree */

2074 econtext->ecxt_innertuple = inntuple;

2075

2076 if (ExecQualAndReset(hjclauses, econtext))

2077 {

2078 hjstate->hj_CurTuple = hashTuple;

2079 return true;

2080 }

2081 }

2082

2083 hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);

2084 }

2085

2086 /*

2087 * no match

2088 */

2089 return false;

2090}

ExecStoreMinimalTuple

TupleTableSlot * ExecStoreMinimalTuple(MinimalTuple mtup, TupleTableSlot *slot, bool shouldFree)

Definition: execTuples.c:1635

ExecQualAndReset

static bool ExecQualAndReset(ExprState *state, ExprContext *econtext)

Definition: executor.h:543

ExecParallelHashFirstTuple

static HashJoinTuple ExecParallelHashFirstTuple(HashJoinTable hashtable, int bucketno)

Definition: nodeHash.c:3444

ExecParallelHashNextTuple

static HashJoinTuple ExecParallelHashNextTuple(HashJoinTable hashtable, HashJoinTuple tuple)

Definition: nodeHash.c:3460

ExprContext::ecxt_innertuple

TupleTableSlot * ecxt_innertuple

Definition: execnodes.h:275

ExprState

Definition: execnodes.h:85

HashJoinState::hj_CurTuple

HashJoinTuple hj_CurTuple

Definition: execnodes.h:2262

HashJoinState::hashclauses

ExprState * hashclauses

Definition: execnodes.h:2256

HashJoinState::hj_CurHashValue

uint32 hj_CurHashValue

Definition: execnodes.h:2259

HashJoinState::hj_CurBucketNo

int hj_CurBucketNo

Definition: execnodes.h:2260

HashJoinState::hj_HashTupleSlot

TupleTableSlot * hj_HashTupleSlot

Definition: execnodes.h:2264

TupleTableSlot

Definition: tuptable.h:115

References ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, and HJTUPLE_MINTUPLE.

Referenced by ExecHashJoinImpl().

◆ ExecParallelScanHashTableForUnmatched()

bool ExecParallelScanHashTableForUnmatched ( HashJoinState * hjstate,

ExprContext * econtext

)

Definition at line 2257 of file nodeHash.c.

2259{

2260 HashJoinTable hashtable = hjstate->hj_HashTable;

2261 HashJoinTuple hashTuple = hjstate->hj_CurTuple;

2262

2263 for (;;)

2264 {

2265 /*

2266 * hj_CurTuple is the address of the tuple last returned from the

2267 * current bucket, or NULL if it's time to start scanning a new

2268 * bucket.

2269 */

2270 if (hashTuple != NULL)

2271 hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);

2272 else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)

2273 hashTuple = ExecParallelHashFirstTuple(hashtable,

2274 hjstate->hj_CurBucketNo++);

2275 else

2276 break; /* finished all buckets */

2277

2278 while (hashTuple != NULL)

2279 {

2280 if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))

2281 {

2282 TupleTableSlot *inntuple;

2283

2284 /* insert hashtable's tuple into exec slot */

2285 inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),

2286 hjstate->hj_HashTupleSlot,

2287 false); /* do not pfree */

2288 econtext->ecxt_innertuple = inntuple;

2289

2290 /*

2291 * Reset temp memory each time; although this function doesn't

2292 * do any qual eval, the caller will, so let's keep it

2293 * parallel to ExecScanHashBucket.

2294 */

2295 ResetExprContext(econtext);

2296

2297 hjstate->hj_CurTuple = hashTuple;

2298 return true;

2299 }

2300

2301 hashTuple = ExecParallelHashNextTuple(hashtable, hashTuple);

2302 }

2303

2304 /* allow this loop to be cancellable */

2305 CHECK_FOR_INTERRUPTS();

2306 }

2307

2308 /*

2309 * no more unmatched tuples

2310 */

2311 return false;

2312}

ResetExprContext

#define ResetExprContext(econtext)

Definition: executor.h:647

HeapTupleHeaderHasMatch

static bool HeapTupleHeaderHasMatch(const MinimalTupleData *tup)

Definition: htup_details.h:706

CHECK_FOR_INTERRUPTS

#define CHECK_FOR_INTERRUPTS()

Definition: miscadmin.h:122

References CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecParallelHashFirstTuple(), ExecParallelHashNextTuple(), ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, HashJoinTableData::nbuckets, and ResetExprContext.

Referenced by ExecHashJoinImpl().

◆ ExecPrepHashTableForUnmatched()

void ExecPrepHashTableForUnmatched ( HashJoinState * hjstate )

Definition at line 2097 of file nodeHash.c.

2098{

2099 /*----------

2100 * During this scan we use the HashJoinState fields as follows:

2101 *

2102 * hj_CurBucketNo: next regular bucket to scan

2103 * hj_CurSkewBucketNo: next skew bucket (an index into skewBucketNums)

2104 * hj_CurTuple: last tuple returned, or NULL to start next bucket

2105 *----------

2106 */

2107 hjstate->hj_CurBucketNo = 0;

2108 hjstate->hj_CurSkewBucketNo = 0;

2109 hjstate->hj_CurTuple = NULL;

2110}

HashJoinState::hj_CurSkewBucketNo

int hj_CurSkewBucketNo

Definition: execnodes.h:2261

References HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, and HashJoinState::hj_CurTuple.

Referenced by ExecHashJoinImpl(), and ExecParallelPrepHashTableForUnmatched().

◆ ExecReScanHash()

void ExecReScanHash ( HashState * node )

Definition at line 2374 of file nodeHash.c.

2375{

2376 PlanState *outerPlan = outerPlanState(node);

2377

2378 /*

2379 * if chgParam of subnode is not null then plan will be re-scanned by

2380 * first ExecProcNode.

2381 */

2382 if (outerPlan->chgParam == NULL)

2383 ExecReScan(outerPlan);

2384}

ExecReScan

void ExecReScan(PlanState *node)

Definition: execAmi.c:77

References ExecReScan(), outerPlan, and outerPlanState.

Referenced by ExecReScan().

◆ ExecScanHashBucket()

bool ExecScanHashBucket ( HashJoinState * hjstate,

ExprContext * econtext

)

Definition at line 1985 of file nodeHash.c.

1987{

1988 ExprState *hjclauses = hjstate->hashclauses;

1989 HashJoinTable hashtable = hjstate->hj_HashTable;

1990 HashJoinTuple hashTuple = hjstate->hj_CurTuple;

1991 uint32 hashvalue = hjstate->hj_CurHashValue;

1992

1993 /*

1994 * hj_CurTuple is the address of the tuple last returned from the current

1995 * bucket, or NULL if it's time to start scanning a new bucket.

1996 *

1997 * If the tuple hashed to a skew bucket then scan the skew bucket

1998 * otherwise scan the standard hashtable bucket.

1999 */

2000 if (hashTuple != NULL)

2001 hashTuple = hashTuple->next.unshared;

2002 else if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)

2003 hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;

2004 else

2005 hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];

2006

2007 while (hashTuple != NULL)

2008 {

2009 if (hashTuple->hashvalue == hashvalue)

2010 {

2011 TupleTableSlot *inntuple;

2012

2013 /* insert hashtable's tuple into exec slot so ExecQual sees it */

2014 inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),

2015 hjstate->hj_HashTupleSlot,

2016 false); /* do not pfree */

2017 econtext->ecxt_innertuple = inntuple;

2018

2019 if (ExecQualAndReset(hjclauses, econtext))

2020 {

2021 hjstate->hj_CurTuple = hashTuple;

2022 return true;

2023 }

2024 }

2025

2026 hashTuple = hashTuple->next.unshared;

2027 }

2028

2029 /*

2030 * no match

2031 */

2032 return false;

2033}

References HashJoinTableData::buckets, ExprContext::ecxt_innertuple, ExecQualAndReset(), ExecStoreMinimalTuple(), HashJoinState::hashclauses, HashJoinTupleData::hashvalue, HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurHashValue, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, INVALID_SKEW_BUCKET_NO, HashJoinTupleData::next, HashJoinTableData::skewBucket, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecScanHashTableForUnmatched()

bool ExecScanHashTableForUnmatched ( HashJoinState * hjstate,

ExprContext * econtext

)

Definition at line 2183 of file nodeHash.c.

2184{

2185 HashJoinTable hashtable = hjstate->hj_HashTable;

2186 HashJoinTuple hashTuple = hjstate->hj_CurTuple;

2187

2188 for (;;)

2189 {

2190 /*

2191 * hj_CurTuple is the address of the tuple last returned from the

2192 * current bucket, or NULL if it's time to start scanning a new

2193 * bucket.

2194 */

2195 if (hashTuple != NULL)

2196 hashTuple = hashTuple->next.unshared;

2197 else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)

2198 {

2199 hashTuple = hashtable->buckets.unshared[hjstate->hj_CurBucketNo];

2200 hjstate->hj_CurBucketNo++;

2201 }

2202 else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets)

2203 {

2204 int j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo];

2205

2206 hashTuple = hashtable->skewBucket[j]->tuples;

2207 hjstate->hj_CurSkewBucketNo++;

2208 }

2209 else

2210 break; /* finished all buckets */

2211

2212 while (hashTuple != NULL)

2213 {

2214 if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))

2215 {

2216 TupleTableSlot *inntuple;

2217

2218 /* insert hashtable's tuple into exec slot */

2219 inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),

2220 hjstate->hj_HashTupleSlot,

2221 false); /* do not pfree */

2222 econtext->ecxt_innertuple = inntuple;

2223

2224 /*

2225 * Reset temp memory each time; although this function doesn't

2226 * do any qual eval, the caller will, so let's keep it

2227 * parallel to ExecScanHashBucket.

2228 */

2229 ResetExprContext(econtext);

2230

2231 hjstate->hj_CurTuple = hashTuple;

2232 return true;

2233 }

2234

2235 hashTuple = hashTuple->next.unshared;

2236 }

2237

2238 /* allow this loop to be cancellable */

2239 CHECK_FOR_INTERRUPTS();

2240 }

2241

2242 /*

2243 * no more unmatched tuples

2244 */

2245 return false;

2246}

References HashJoinTableData::buckets, CHECK_FOR_INTERRUPTS, ExprContext::ecxt_innertuple, ExecStoreMinimalTuple(), HeapTupleHeaderHasMatch(), HashJoinState::hj_CurBucketNo, HashJoinState::hj_CurSkewBucketNo, HashJoinState::hj_CurTuple, HashJoinState::hj_HashTable, HashJoinState::hj_HashTupleSlot, HJTUPLE_MINTUPLE, j, HashJoinTableData::nbuckets, HashJoinTupleData::next, HashJoinTableData::nSkewBuckets, ResetExprContext, HashJoinTableData::skewBucket, HashJoinTableData::skewBucketNums, HashSkewBucket::tuples, HashJoinTupleData::unshared, and HashJoinTableData::unshared.

Referenced by ExecHashJoinImpl().

◆ ExecShutdownHash()

void ExecShutdownHash ( HashState * node )

Definition at line 2824 of file nodeHash.c.

2825{

2826 /* Allocate save space if EXPLAIN'ing and we didn't do so already */

2827 if (node->ps.instrument && !node->hinstrument)

2828 node->hinstrument = palloc0_object(HashInstrumentation);

2829 /* Now accumulate data for the current (final) hash table */

2830 if (node->hinstrument && node->hashtable)

2831 ExecHashAccumInstrumentation(node->hinstrument, node->hashtable);

2832}

palloc0_object

#define palloc0_object(type)

Definition: fe_memutils.h:75

ExecHashAccumInstrumentation

void ExecHashAccumInstrumentation(HashInstrumentation *instrument, HashJoinTable hashtable)

Definition: nodeHash.c:2870

References ExecHashAccumInstrumentation(), HashState::hashtable, HashState::hinstrument, PlanState::instrument, palloc0_object, and HashState::ps.

Referenced by ExecShutdownNode_walker().

◆ MultiExecHash()

Node * MultiExecHash ( HashState * node )

Definition at line 104 of file nodeHash.c.

105{

106 /* must provide our own instrumentation support */

107 if (node->ps.instrument)

108 InstrStartNode(node->ps.instrument);

109

110 if (node->parallel_state != NULL)

111 MultiExecParallelHash(node);

112 else

113 MultiExecPrivateHash(node);

114

115 /* must provide our own instrumentation support */

116 if (node->ps.instrument)

117 InstrStopNode(node->ps.instrument, node->hashtable->partialTuples);

118

119 /*

120 * We do not return the hash table directly because it's not a subtype of

121 * Node, and so would violate the MultiExecProcNode API. Instead, our

122 * parent Hashjoin node is expected to know how to fish it out of our node

123 * state. Ugly but not really worth cleaning up, since Hashjoin knows

124 * quite a bit more about Hash besides that.

125 */

126 return NULL;

127}

InstrStartNode

void InstrStartNode(Instrumentation *instr)

Definition: instrument.c:68

InstrStopNode

void InstrStopNode(Instrumentation *instr, double nTuples)

Definition: instrument.c:84

MultiExecParallelHash

static void MultiExecParallelHash(HashState *node)

Definition: nodeHash.c:218

MultiExecPrivateHash

static void MultiExecPrivateHash(HashState *node)

Definition: nodeHash.c:137

HashState::parallel_state

struct ParallelHashJoinState * parallel_state

Definition: execnodes.h:2834

References HashState::hashtable, InstrStartNode(), InstrStopNode(), PlanState::instrument, MultiExecParallelHash(), MultiExecPrivateHash(), HashState::parallel_state, HashJoinTableData::partialTuples, and HashState::ps.

Referenced by MultiExecProcNode().