/*-------------------------------------------------------------------------
 *
 * dynahash.c
 *	  dynamic chained hash tables
 *
 * dynahash.c supports both local-to-a-backend hash tables and hash tables in
 * shared memory.  For shared hash tables, it is the caller's responsibility
 * to provide appropriate access interlocking.  The simplest convention is
 * that a single LWLock protects the whole hash table.  Searches (HASH_FIND or
 * hash_seq_search) need only shared lock, but any update requires exclusive
 * lock.  For heavily-used shared tables, the single-lock approach creates a
 * concurrency bottleneck, so we also support "partitioned" locking wherein
 * there are multiple LWLocks guarding distinct subsets of the table.  To use
 * a hash table in partitioned mode, the HASH_PARTITION flag must be given
 * to hash_create.  This prevents any attempt to split buckets on-the-fly.
 * Therefore, each hash bucket chain operates independently, and no fields
 * of the hash header change after init except nentries and freeList.
 * (A partitioned table uses multiple copies of those fields, guarded by
 * spinlocks, for additional concurrency.)
 * This lets any subset of the hash buckets be treated as a separately
 * lockable partition.  We expect callers to use the low-order bits of a
 * lookup key's hash value as a partition number --- this will work because
 * of the way calc_bucket() maps hash values to bucket numbers.
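 *
 * For illustration, a caller of a partitioned table might do something like
 * the following (a sketch only: the lock array and its size are the caller's
 * own arrangements, not anything defined in this file):
 *
 *		uint32		hashcode = get_hash_value(hashp, &key);
 *		LWLock	   *lock = my_partition_locks[hashcode % MY_NUM_PARTITIONS];
 *
 *		LWLockAcquire(lock, LW_EXCLUSIVE);
 *		entry = hash_search_with_hash_value(hashp, &key, hashcode,
 *											HASH_ENTER, &found);
 *		LWLockRelease(lock);
 *
 * where MY_NUM_PARTITIONS matches the num_partitions given to hash_create.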
 *
 * The memory allocator function should match malloc's semantics of returning
 * NULL on failure.  (This is essential for hash tables in shared memory.
 * For hash tables in local memory, we used to use palloc() which will throw
 * error on failure; but we no longer do, so it's untested whether this
 * module will still cope with that behavior.)
 *
 * dynahash.c provides support for these types of lookup keys:
 *
 * 1. Null-terminated C strings (truncated if necessary to fit in keysize),
 * compared as though by strcmp().  This is selected by specifying the
 * HASH_STRINGS flag to hash_create.
 *
 * 2. Arbitrary binary data of size keysize, compared as though by memcmp().
 * (Caller must ensure there are no undefined padding bits in the keys!)
 * This is selected by specifying the HASH_BLOBS flag to hash_create.
 *
 * 3. More complex key behavior can be selected by specifying user-supplied
 * hashing, comparison, and/or key-copying functions.  At least a hashing
 * function must be supplied; comparison defaults to memcmp() and key copying
 * to memcpy() when a user-defined hashing function is selected.
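 *
 * For instance (an illustrative sketch; MyHashFunc stands in for some
 * caller-supplied HashValueFunc), the three cases are selected like so:
 *
 *		flags = HASH_ELEM | HASH_STRINGS;			(case 1)
 *		flags = HASH_ELEM | HASH_BLOBS;				(case 2)
 *		info.hash = MyHashFunc;
 *		flags = HASH_ELEM | HASH_FUNCTION;			(case 3)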
 *
 * Compared to simplehash, dynahash has the following benefits:
 *
 * - It supports partitioning, which is useful for shared memory access using
 *   locks.
 * - Shared memory hashes are allocated in a fixed size area at startup and
 *   are discoverable by name from other processes.
 * - Because entries don't need to be moved in the case of hash conflicts,
 *   dynahash has better performance for large entries.
 * - Guarantees stable pointers to entries.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/hash/dynahash.c
 *
 *-------------------------------------------------------------------------
 */

/*
 * Original comments:
 *
 * Dynamic hashing, after CACM April 1988 pp 446-457, by Per-Ake Larson.
 * Coded into C, with minor code improvements, and with hsearch(3) interface,
 * by ejp@ausmelb.oz, Jul 26, 1988: 13:16;
 * also, hcreate/hdestroy routines added to simulate hsearch(3).
 *
 * These routines simulate hsearch(3) and family, with the important
 * difference that the hash table is dynamic - can grow indefinitely
 * beyond its original size (as supplied to hcreate()).
 *
 * Performance appears to be comparable to that of hsearch(3).
 * The 'source-code' options referred to in hsearch(3)'s 'man' page
 * are not implemented; otherwise functionality is identical.
 *
 * Compilation controls:
 * HASH_STATISTICS causes some usage statistics to be maintained, which can be
 * logged by calling hash_stats().
 *
 * Problems & fixes to ejp@ausmelb.oz.  WARNING: relies on pre-processor
 * concatenation property, in probably unnecessary code 'optimization'.
 *
 * Modified margo@postgres.berkeley.edu February 1990
 *		added multiple table interface
 * Modified by sullivan@postgres.berkeley.edu April 1990
 *		changed ctl structure for shared memory
 */

/*
 * A hash table has a top-level "directory", each of whose entries points
 * to a "segment" of ssize bucket headers.  The maximum number of hash
 * buckets is thus dsize * ssize (but dsize may be expansible).  Of course,
 * the number of records in the table can be larger, but we don't want a
 * whole lot of records per bucket or performance goes down.
 *
 * In a hash table allocated in shared memory, the directory cannot be
 * expanded because it must stay at a fixed address.  The directory size
 * should be selected using hash_select_dirsize (and you'd better have
 * a good idea of the maximum number of entries!).  For non-shared hash
 * tables, the initial directory size can be left at the default.
 */
#define DEF_SEGSIZE			   256
#define DEF_SEGSIZE_SHIFT	   8	/* must be log2(DEF_SEGSIZE) */
#define DEF_DIRSIZE			   256

/* Number of freelists to be used for a partitioned hash table. */
#define NUM_FREELISTS			32

/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;

/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;

/*
 * Per-freelist data.
 *
 * In a partitioned hash table, each freelist is associated with a specific
 * set of hashcodes, as determined by the FREELIST_IDX() macro below.
 * nentries tracks the number of live hashtable entries having those hashcodes
 * (NOT the number of entries in the freelist, as you might expect).
 *
 * The coverage of a freelist might be more or less than one partition, so it
 * needs its own lock rather than relying on caller locking.  Relying on that
 * wouldn't work even if the coverage was the same, because of the occasional
 * need to "borrow" entries from another freelist; see get_hash_entry().
 *
 * Using an array of FreeListData instead of separate arrays of mutexes,
 * nentries and freeLists helps to reduce sharing of cache lines between
 * different mutexes.
 */
typedef struct
{
	slock_t		mutex;			/* spinlock for this freelist */
	int64		nentries;		/* number of entries in associated buckets */
	HASHELEMENT *freeList;		/* chain of free elements */
} FreeListData;

/*
 * Header structure for a hash table --- contains all changeable info
 *
 * In a shared-memory hash table, the HASHHDR is in shared memory, while
 * each backend has a local HTAB struct.  For a non-shared table, there isn't
 * any functional difference between HASHHDR and HTAB, but we separate them
 * anyway to share code between shared and non-shared tables.
 */
struct HASHHDR
{
	/*
	 * The freelist can become a point of contention in high-concurrency hash
	 * tables, so we use an array of freelists, each with its own mutex and
	 * nentries count, instead of just a single one.  Although the freelists
	 * normally operate independently, we will scavenge entries from freelists
	 * other than a hashcode's default freelist when necessary.
	 *
	 * If the hash table is not partitioned, only freeList[0] is used and its
	 * spinlock is not used at all; callers' locking is assumed sufficient.
	 */
	FreeListData freeList[NUM_FREELISTS];

	/* These fields can change, but not in a partitioned table */
	/* Also, dsize can't change in a shared table, even if unpartitioned */
	int64		dsize;			/* directory size */
	int64		nsegs;			/* number of allocated segments (<= dsize) */
	uint32		max_bucket;		/* ID of maximum bucket in use */
	uint32		high_mask;		/* mask to modulo into entire table */
	uint32		low_mask;		/* mask to modulo into lower half of table */

	/* These fields are fixed at hashtable creation */
	Size		keysize;		/* hash key length in bytes */
	Size		entrysize;		/* total user element size in bytes */
	int			num_partitions; /* # partitions (must be power of 2), or 0 */
	int64		max_dsize;		/* 'dsize' limit if directory is fixed size */
	int64		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */
	int			nelem_alloc;	/* number of entries to allocate at once */
	bool		isfixed;		/* if true, don't enlarge */

#ifdef HASH_STATISTICS

	/*
	 * Count statistics here.  NB: stats code doesn't bother with mutex, so
	 * counts could be corrupted a bit in a partitioned table.
	 */
	uint64		accesses;
	uint64		collisions;
	uint64		expansions;
#endif
};

#define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)

#define FREELIST_IDX(hctl, hashcode) \
	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)

/*
 * Top control structure for a hashtable --- in a shared table, each backend
 * has its own copy (OK since no fields change at runtime)
 */
struct HTAB
{
	HASHHDR    *hctl;			/* => shared control information */
	HASHSEGMENT *dir;			/* directory of segment starts */
	HashValueFunc hash;			/* hash function */
	HashCompareFunc match;		/* key comparison function */
	HashCopyFunc keycopy;		/* key copying function */
	HashAllocFunc alloc;		/* memory allocator */
	MemoryContext hcxt;			/* memory context if default allocator used */
	char	   *tabname;		/* table name (for error messages) */
	bool		isshared;		/* true if table is in shared memory */

	/* freezing a shared table isn't allowed, so we can keep state here */
	bool		frozen;			/* true = no more inserts allowed */

	/* We keep local copies of these fixed values to reduce contention */
	Size		keysize;		/* hash key length in bytes */
	int64		ssize;			/* segment size --- must be power of 2 */
	int			sshift;			/* segment shift = log2(ssize) */

	/*
	 * In a USE_VALGRIND build, non-shared hashtables keep an slist chain of
	 * all the element blocks they have allocated.  This pacifies Valgrind,
	 * which would otherwise often claim that the element blocks are "possibly
	 * lost" for lack of any non-interior pointers to their starts.
	 */
#ifdef USE_VALGRIND
	slist_head	element_blocks;
#endif
};

/*
 * Key (also entry) part of a HASHELEMENT
 */
#define ELEMENTKEY(helem)  (((char *) (helem)) + MAXALIGN(sizeof(HASHELEMENT)))

/*
 * Obtain element pointer given pointer to key
 */
#define ELEMENT_FROM_KEY(key)  \
	((HASHELEMENT *) (((char *) (key)) - MAXALIGN(sizeof(HASHELEMENT))))
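
/*
 * For example (illustrative only; "MyEntry" is hypothetical): callers
 * declare their entry types with the hash key as the first field, so the
 * entry pointer returned by hash_search() is also the key pointer that
 * ELEMENT_FROM_KEY() expects:
 *
 *		typedef struct
 *		{
 *			Oid			key;		-- hash key; must be first
 *			int			count;		-- caller's payload follows
 *		} MyEntry;
 */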

/*
 * Fast MOD arithmetic, assuming that y is a power of 2 !
 */
#define MOD(x,y)			   ((x) & ((y)-1))

/*
 * Private function prototypes
 */

/*
 * memory allocation support
 */

/*
 * HashCompareFunc for string keys
 *
 * Because we copy keys with strlcpy(), they will be truncated at keysize-1
 * bytes, so we can only compare that many ... hence strncmp is almost but
 * not quite the right thing.
 */
static int
string_compare(const char *key1, const char *key2, Size keysize)
{
	return strncmp(key1, key2, keysize - 1);
}

/************************** CREATE ROUTINES **********************/

/*
 * hash_create -- create a new dynamic hash table
 *
 *	tabname: a name for the table (for debugging purposes)
 *	nelem: maximum number of elements expected
 *	*info: additional table parameters, as indicated by flags
 *	flags: bitmask indicating which parameters to take from *info
 *
 * The flags value *must* include HASH_ELEM.  (Formerly, this was nominally
 * optional, but the default keysize and entrysize values were useless.)
 * The flags value must also include exactly one of HASH_STRINGS, HASH_BLOBS,
 * or HASH_FUNCTION, to define the key hashing semantics (C strings,
 * binary blobs, or custom, respectively).  Callers specifying a custom
 * hash function will likely also want to use HASH_COMPARE, and perhaps
 * also HASH_KEYCOPY, to control key comparison and copying.
 * Another often-used flag is HASH_CONTEXT, to allocate the hash table
 * under info->hcxt rather than under TopMemoryContext; the default
 * behavior is only suitable for session-lifespan hash tables.
 * Other flags bits are special-purpose and seldom used, except for those
 * associated with shared-memory hash tables, for which see ShmemInitHash().
 *
 * Fields in *info are read only when the associated flags bit is set.
 * It is not necessary to initialize other fields of *info.
 * Neither tabname nor *info need persist after the hash_create() call.
 *
 * Note: It is deprecated for callers of hash_create() to explicitly specify
 * string_hash, tag_hash, uint32_hash, or oid_hash.  Just set HASH_STRINGS or
 * HASH_BLOBS.  Use HASH_FUNCTION only when you want something other than
 * one of these.
 *
 * Note: for a shared-memory hashtable, nelem needs to be a pretty good
 * estimate, since we can't expand the table on the fly.  But an unshared
 * hashtable can be expanded on-the-fly, so it's better for nelem to be
 * on the small side and let the table grow if it's exceeded.  An overly
 * large nelem will penalize hash_seq_search speed without buying much.
 */
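
/*
 * Typical usage looks like this (a sketch only; "MyEntry" is a
 * hypothetical caller-defined struct whose first field is the key):
 *
 *		HASHCTL		info;
 *		HTAB	   *htab;
 *
 *		info.keysize = sizeof(Oid);
 *		info.entrysize = sizeof(MyEntry);
 *		htab = hash_create("My lookup table",
 *						   256,		-- initial size estimate
 *						   &info,
 *						   HASH_ELEM | HASH_BLOBS);
 */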
HTAB *
hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
{
	/*
	 * Hash tables now allocate space for key and data, but you have to say
	 * how much space to allocate.
	 */

	/*
	 * For shared hash tables, we have a local hash header (HTAB struct) that
	 * we allocate in TopMemoryContext; all else is in shared memory.
	 *
	 * For non-shared hash tables, everything including the hash header is in
	 * a memory context created specially for the hash table --- this makes
	 * hash_destroy very simple.  The memory context is made a child of either
	 * a context specified by the caller, or TopMemoryContext if nothing is
	 * specified.
	 */

	/* Set up to allocate the hash header */

	/* Create the hash table's private memory context */

	/* Initialize the hash header, plus a copy of the table name */
	hashp = (HTAB *) DynaHashAlloc(sizeof(HTAB) + strlen(tabname) + 1);

	hashp->tabname = (char *) (hashp + 1);
	strcpy(hashp->tabname, tabname);

	/* If we have a private context, label it with hashtable's name */

	/*
	 * Select the appropriate hash function (see comments at head of file).
	 */

	/* We can optimize hashing for common key sizes */

	/*
	 * string_hash used to be considered the default hash method, and in a
	 * non-assert build it effectively still is.  But we now consider it an
	 * assertion error to not say HASH_STRINGS explicitly.  To help catch
	 * mistaken usage of HASH_STRINGS, we also insist on a reasonably long
	 * string length: if the keysize is only 4 or 8 bytes, it's almost
	 * certainly an integer or pointer not a string.
	 */

	/*
	 * If you don't specify a match function, it defaults to string_compare
	 * if you used string_hash, and to memcmp otherwise.
	 *
	 * Note: explicitly specifying string_hash is deprecated, because this
	 * might not work for callers in loadable modules on some platforms due
	 * to referencing a trampoline instead of the string_hash function
	 * proper.  Specify HASH_STRINGS instead.
	 */
		hashp->match = memcmp;

	/*
	 * Similarly, the key-copying function defaults to strlcpy or memcpy.
	 */

	/*
	 * The signature of keycopy is meant for memcpy(), which returns
	 * void *, but strlcpy() returns size_t.  Since we never use the return
	 * value of keycopy, and size_t is pretty much always the same size as
	 * void *, this should be safe.  The extra cast in the middle is to
	 * avoid warnings from -Wcast-function-type.
	 */

	/* And select the entry allocation function, too. */

	/*
	 * ctl structure and directory are preallocated for shared memory
	 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
	 * well.
	 */

	/* hash table already exists, we're just attaching to it */

	/* make local copies of some heavily-used values */

	/* setup hash table defaults */
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));

	/* Doesn't make sense to partition a local hash table */

	/*
	 * The number of partitions had better be a power of 2.  Also, it must
	 * be less than INT_MAX (see init_htab()), so call the int version of
	 * next_pow2.
	 */

	/* ssize had better be a power of 2 */

	/*
	 * SHM hash tables have fixed directory size passed by the caller.
	 */

	/* remember the entry sizes, too */

	/* make local copies of heavily-used constant fields */

	/* Build the hash directory structure */

	/*
	 * For a shared hash table, preallocate the requested number of elements.
	 * This reduces problems with run-time out-of-shared-memory conditions.
	 *
	 * For a non-shared hash table, preallocate the requested number of
	 * elements if it's less than our chosen nelem_alloc.  This avoids
	 * wasting space if the caller correctly estimates a small table size.
	 */
	if ((flags & HASH_SHARED_MEM) ||
		nelem < hctl->nelem_alloc)
		/*
		 * If hash table is partitioned, give each freelist an equal share
		 * of the initial allocation.  Otherwise only freeList[0] is used.
		 */
		if (IS_PARTITIONED(hashp->hctl))
			freelist_partitions = NUM_FREELISTS;
		else
			freelist_partitions = 1;

		nelem_alloc = nelem / freelist_partitions;
		if (nelem_alloc <= 0)
			nelem_alloc = 1;

		/*
		 * Make sure we'll allocate all the requested elements; freeList[0]
		 * gets the excess if the request isn't divisible by NUM_FREELISTS.
		 */
		if (nelem_alloc * freelist_partitions < nelem)
			nelem_alloc_first =
				nelem - nelem_alloc * (freelist_partitions - 1);
		else
			nelem_alloc_first = nelem_alloc;
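
		/*
		 * Worked example (illustrative numbers): with nelem = 1000 and
		 * NUM_FREELISTS = 32, nelem_alloc is 31; since 31 * 32 = 992 falls
		 * short of 1000, freeList[0] gets nelem_alloc_first = 1000 - 31 * 31
		 * = 39 entries and each other freelist gets 31.
		 */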

		for (i = 0; i < freelist_partitions; i++)
		{
			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;

			if (!element_alloc(hashp, temp, i))
				ereport(ERROR,
						(errcode(ERRCODE_OUT_OF_MEMORY),
						 errmsg("out of memory")));
		}

	/* Set isfixed if requested, but not till after we build initial entries */

/*
 * Set default HASHHDR parameters.
 */

	/* table has no fixed maximum size */

	hctl->isfixed = false;		/* can be enlarged */

#ifdef HASH_STATISTICS
	hctl->accesses = hctl->collisions = hctl->expansions = 0;
#endif

/*
 * Given the user-specified entry size, choose nelem_alloc, ie, how many
 * elements to add to the hash table when we need more.
 */

	/* Each element has a HASHELEMENT header plus user data. */
	/* NB: this had better match element_alloc() */

	/*
	 * The idea here is to choose nelem_alloc at least 32, but round up so
	 * that the allocation request will be a power of 2 or just less.  This
	 * makes little difference for hash tables in shared memory, but for
	 * hash tables managed by palloc, the allocation request will be rounded
	 * up to a power of 2 anyway.  If we fail to take this into account,
	 * we'll waste as much as half the allocated space.
	 */
	allocSize = 32 * 4;			/* assume elementSize at least 8 */
	do
	{
		allocSize <<= 1;
		nelem_alloc = allocSize / elementSize;
	} while (nelem_alloc < 32);
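
	/*
	 * Worked example (illustrative): with elementSize = 56, allocSize grows
	 * 256, 512, 1024, 2048; the loop stops at 2048 because 2048 / 56 = 36 >=
	 * 32, so nelem_alloc is 36 and each allocation request is a power of 2.
	 */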

/*
 * Compute derived fields of hctl and build the initial directory/segment
 * arrays
 */

	/*
	 * initialize mutexes if it's a partitioned table
	 */

	/*
	 * Allocate space for the next greater power of two number of buckets,
	 * assuming a desired maximum load factor of 1.
	 */

	/*
	 * In a partitioned table, nbuckets must be at least equal to
	 * num_partitions; were it less, keys with apparently different partition
	 * numbers would map to the same bucket, breaking partition independence.
	 * (Normally nbuckets will be much bigger; this is just a safety check.)
	 */
	while (nbuckets < hctl->num_partitions)
		nbuckets <<= 1;

	/*
	 * Figure number of directory segments needed, round up to a power of 2
	 */
	nsegs = (nbuckets - 1) / hctl->ssize + 1;

	/*
	 * Make sure directory is big enough.  If pre-allocated directory is too
	 * small, choke (caller screwed up).
	 */
	if (nsegs > hctl->dsize)

	/* Allocate a directory */

	/* Allocate initial segments */
	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)

	/* Choose number of entries to allocate at a time */

/*
 * Estimate the space needed for a hashtable containing the given number
 * of entries of given size.
 * NOTE: this is used to estimate the footprint of hashtables in shared
 * memory; therefore it does not count HTAB which is in local memory.
 * NB: assumes that all hash structure parameters have default values!
 */
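
/*
 * For example (a sketch of typical shmem-sizing usage; "size" is the
 * caller's running total, not something defined here):
 *
 *		size = add_size(size, hash_estimate_size(1024, sizeof(MyEntry)));
 */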
	/* estimate number of buckets wanted */
	/* # of segments needed for nBuckets */
	/* directory entries */
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

	/* fixed control info */

	/* elements --- allocated in groups of choose_nelem_alloc() entries */
	nElementAllocs = (num_entries - 1) / elementAllocCnt + 1;
	size = add_size(size,
					mul_size(nElementAllocs,
							 mul_size(elementAllocCnt, elementSize)));

/*
 * Select an appropriate directory size for a hashtable with the given
 * maximum number of entries.
 * This is only needed for hashtables in shared memory, whose directories
 * cannot be expanded dynamically.
 * NB: assumes that all hash structure parameters have default values!
 *
 * XXX this had better agree with the behavior of init_htab()...
 */

	/* estimate number of buckets wanted */
	/* # of segments needed for nBuckets */
	/* directory entries */
	while (nDirEntries < nSegments)
		nDirEntries <<= 1;		/* dir_alloc doubles dsize at each call */

/*
 * Compute the required initial memory allocation for a shared-memory
 * hashtable with the given parameters.  We need space for the HASHHDR
 * and for the (non-expansible) directory.
 */

/********************** DESTROY ROUTINES ************************/

	/* allocation method must be one we know how to free, too */
	/* so this hashtable must have its own context */

	/*
	 * Free everything by destroying the hash table's memory context.
	 */

#ifdef HASH_STATISTICS
		 caller != NULL ? caller : "(unknown)", hashp->tabname, hctl->accesses,

/******************************* SEARCH ROUTINES ****************************/

/*
 * get_hash_value -- exported routine to calculate a key's hash value
 *
 * We export this because for partitioned tables, callers need to compute
 * the partition number (from the low-order bits of the hash value) before
 * searching.
 */

/* Convert a hash value to a bucket number */

/*
 * hash_search -- look up key in table and perform action
 * hash_search_with_hash_value -- same, with key's hash value already computed
 *
 * action is one of:
 *		HASH_FIND: look up key in table
 *		HASH_ENTER: look up key in table, creating entry if not present
 *		HASH_ENTER_NULL: same, but return NULL if out of memory
 *		HASH_REMOVE: look up key in table, remove entry if present
 *
 * Return value is a pointer to the element found/entered/removed if any,
 * or NULL if no match was found.  (NB: in the case of the REMOVE action,
 * the result is a dangling pointer that shouldn't be dereferenced!)
 *
 * HASH_ENTER will normally ereport a generic "out of memory" error if it
 * is unable to create a new entry.  The HASH_ENTER_NULL operation is the
 * same except it will return NULL if out of memory.
 *
 * If foundPtr isn't NULL, then *foundPtr is set true if we found an
 * existing entry in the table, false otherwise.  This is needed in the
 * HASH_ENTER case, but is redundant with the return value otherwise.
 *
 * For hash_search_with_hash_value, the hashvalue parameter must have been
 * calculated with get_hash_value().
 */
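
/*
 * Typical HASH_ENTER usage looks like this (a sketch; "MyEntry" and its
 * fields are hypothetical):
 *
 *		bool		found;
 *		MyEntry    *entry;
 *
 *		entry = (MyEntry *) hash_search(htab, &key, HASH_ENTER, &found);
 *		if (!found)
 *			entry->count = 0;	-- caller initializes the rest of a new entry
 *		entry->count++;
 */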

#ifdef HASH_STATISTICS
	hctl->accesses++;
#endif

	/*
	 * If inserting, check if it is time to split a bucket.
	 *
	 * NOTE: failure to expand table is not a fatal error, it just means we
	 * have to run at higher fill factor than we wanted.  However, if we're
	 * using the palloc allocator then it will throw error anyway on
	 * out-of-memory, so we must do this before modifying the table.
	 */

		/*
		 * Can't split if running in partitioned mode, nor if frozen, nor if
		 * table is the subject of any active hash_seq_search scans.
		 */

	/*
	 * Do the initial lookup
	 */
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == hashvalue &&
			match(ELEMENTKEY(currBucket), keyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (foundPtr)
		*foundPtr = (bool) (currBucket != NULL);

		case HASH_FIND:
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);
			return NULL;

		case HASH_REMOVE:
			if (currBucket != NULL)
			{
				/* if partitioned, must lock to touch nentries and freeList */

				/* delete the record from the appropriate nentries counter. */

				/* remove record from hash bucket's chain. */
				*prevBucketPtr = currBucket->link;

				/* add the record to the appropriate freelist. */

				/*
				 * better hope the caller is synchronizing access to this
				 * element, because someone else is going to reuse it the next
				 * time something is added to the table
				 */
			}

		case HASH_ENTER_NULL:
		case HASH_ENTER:
			/* Return existing element if found, else create one */
			if (currBucket != NULL)
				return ELEMENTKEY(currBucket);

			/* disallow inserts if frozen */
			if (hashp->frozen)
				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
					 hashp->tabname);

			if (currBucket == NULL)
			{
				/* out of memory */
				if (action == HASH_ENTER_NULL)
					return NULL;
				/* report a generic message */
				if (hashp->isshared)
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of shared memory")));
				else
					ereport(ERROR,
							(errcode(ERRCODE_OUT_OF_MEMORY),
							 errmsg("out of memory")));
			}

			/* link into hashbucket chain */
			*prevBucketPtr = currBucket;
			currBucket->link = NULL;

			/* copy key into record */

			/*
			 * Caller is expected to fill the data field on return.  DO NOT
			 * insert any code that could possibly throw error here, as doing
			 * so would leave the table entry incomplete and hence corrupt the
			 * caller's data structure.
			 */

			return ELEMENTKEY(currBucket);
	}

	return NULL;				/* keep compiler quiet */

/*
 * hash_update_hash_key -- change the hash key of an existing table entry
 *
 * This is equivalent to removing the entry, making a new entry, and copying
 * over its data, except that the entry never goes to the table's freelist.
 * Therefore this cannot suffer an out-of-memory failure, even if there are
 * other processes operating in other partitions of the hashtable.
 *
 * Returns true if successful, false if the requested new hash key is already
 * present.  Throws error if the specified entry pointer isn't actually a
 * table member.
 *
 * NB: currently, there is no special case for old and new hash keys being
 * identical, which means we'll report false for that situation.  This is
 * preferable for existing uses.
 *
 * NB: for a partitioned hashtable, caller must hold lock on both relevant
 * partitions, if the new hash key would belong to a different partition.
 */
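
/*
 * A sketch of typical usage ("entry" must point to a live entry of the
 * table):
 *
 *		if (!hash_update_hash_key(htab, entry, &newkey))
 *			elog(ERROR, "duplicate key");
 */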
bool
hash_update_hash_key(HTAB *hashp,
					 void *existingEntry,
					 const void *newKeyPtr)
{
#ifdef HASH_STATISTICS
	hctl->accesses++;
#endif

	/* disallow updates if frozen */
	if (hashp->frozen)
		elog(ERROR, "cannot update in frozen hashtable \"%s\"",
			 hashp->tabname);

	/*
	 * Lookup the existing element using its saved hash value.  We need to do
	 * this to be able to unlink it from its hash chain, but as a side benefit
	 * we can verify the validity of the passed existingEntry pointer.
	 */
	currBucket = *prevBucketPtr;

	while (currBucket != NULL)
	{
		if (currBucket == existingElement)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
	}

	if (currBucket == NULL)
		elog(ERROR, "hash_update_hash_key argument is not in hashtable \"%s\"",
			 hashp->tabname);

	oldPrevPtr = prevBucketPtr;

	/*
	 * Now perform the equivalent of a HASH_ENTER operation to locate the hash
	 * chain we want to put the entry into.
	 */
	newhashvalue = hashp->hash(newKeyPtr, hashp->keysize);
	newbucket = hash_initial_lookup(hashp, newhashvalue, &prevBucketPtr);
	currBucket = *prevBucketPtr;

	/*
	 * Follow collision chain looking for matching key
	 */
	match = hashp->match;		/* save one fetch in inner loop */
	keysize = hashp->keysize;	/* ditto */

	while (currBucket != NULL)
	{
		if (currBucket->hashvalue == newhashvalue &&
			match(ELEMENTKEY(currBucket), newKeyPtr, keysize) == 0)
			break;
		prevBucketPtr = &(currBucket->link);
		currBucket = *prevBucketPtr;
#ifdef HASH_STATISTICS
		hctl->collisions++;
#endif
	}

	if (currBucket != NULL)
		return false;			/* collision with an existing entry */

	currBucket = existingElement;

	/*
	 * If old and new hash values belong to the same bucket, we need not
	 * change any chain links, and indeed should not since this simplistic
	 * update will corrupt the list if currBucket is the last element.  (We
	 * cannot fall out earlier, however, since we need to scan the bucket to
	 * check for duplicate keys.)
	 */
	if (bucket != newbucket)
	{
		/* OK to remove record from old hash bucket's chain. */
		*oldPrevPtr = currBucket->link;

		/* link into new hashbucket chain */
		*prevBucketPtr = currBucket;
		currBucket->link = NULL;
	}

	/* copy new key into record */

	/* rest of record is untouched */

	return true;
}

/*
 * Allocate a new hashtable entry if possible; return NULL if out of memory.
 * (Or, if the underlying space allocator throws error for out-of-memory,
 * we won't return at all.)
 */

	/* if partitioned, must lock to touch nentries and freeList */

	/* try to get an entry from the freelist */

	if (newElement != NULL)
		break;

	/*
	 * No free elements in this freelist.  In a partitioned table, there
	 * might be entries in other freelists, but to reduce contention we
	 * prefer to first try to get another chunk of buckets from the main
	 * shmem allocator.  If that fails, though, we *MUST* root through all
	 * the other freelists before giving up.  There are multiple callers
	 * that assume that they can allocate every element in the initially
	 * requested table size, or that deleting an element guarantees they
	 * can insert a new element, even if shared memory is entirely full.
	 * Failing because the needed element is in a different freelist is
	 * not acceptable.
	 */
		int			borrow_from_idx;

		if (!IS_PARTITIONED(hctl))
			return NULL;		/* out of memory */

		/* try to borrow element from another freelist */
		borrow_from_idx = freelist_idx;
		for (;;)
		{
			borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
			if (borrow_from_idx == freelist_idx)
				break;			/* examined all freelists, fail */

			if (newElement != NULL)
			{
				/* careful: count the new element in its proper freelist */
			}
		}

		/* no elements available to borrow either, so out of memory */
		return NULL;

	/* remove entry from freelist, bump nentries */

/*
 * hash_get_num_entries -- get the number of entries in a hashtable
 */

	/*
	 * We currently don't bother with acquiring the mutexes; it's only
	 * sensible to call this function if you've got lock on all partitions of
	 * the table.
	 */

/*
 * hash_seq_init/_search/_term
 *			Sequentially search through hash table and return
 *			all the elements one by one, return NULL when no more.
 *
 * hash_seq_term should be called if and only if the scan is abandoned before
 * completion; if hash_seq_search returns NULL then it has already done the
 * end-of-scan cleanup.
 *
 * NOTE: caller may delete the returned element before continuing the scan.
 * However, deleting any other element while the scan is in progress is
 * UNDEFINED (it might be the one that curIndex is pointing at!).  Also,
 * if elements are added to the table while the scan is in progress, it is
 * unspecified whether they will be visited by the scan or not.
 *
 * NOTE: it is possible to use hash_seq_init/hash_seq_search without any
 * worry about hash_seq_term cleanup, if the hashtable is first locked against
 * further insertions by calling hash_freeze.
 *
 * NOTE: to use this with a partitioned hashtable, caller had better hold
 * at least shared lock on all partitions of the table throughout the scan!
 * We can cope with insertions or deletions by our own backend, but *not*
 * with concurrent insertions or deletions by another.
 */
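
/*
 * A complete scan therefore usually looks like this (a sketch; "MyEntry"
 * and process() are hypothetical):
 *
 *		HASH_SEQ_STATUS status;
 *		MyEntry    *entry;
 *
 *		hash_seq_init(&status, htab);
 *		while ((entry = (MyEntry *) hash_seq_search(&status)) != NULL)
 *			process(entry);		-- may delete "entry" itself if desired
 *
 * No hash_seq_term() call is needed when the loop runs to completion.
 */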
	status->hashp = hashp;

/*
 * Same as above but scan by the given hash value.
 * See also hash_seq_search().
 *
 * NOTE: the default hash function doesn't match the syscache hash function.
 * Thus, if you're going to use this function in a syscache callback, make
 * sure you're using a custom hash function.  See relatt_cache_syshash()
 * for an example.
 */

	/*
	 * Scan entries only in the current bucket because only this bucket can
	 * contain entries with the given hash value.
	 */
	while ((curElem = status->curEntry) != NULL)

	if ((curElem = status->curEntry) != NULL)
	{
		/* Continuing scan of curBucket... */
		status->curEntry = curElem->link;
		if (status->curEntry == NULL)	/* end of this bucket */
			++status->curBucket;
		return ELEMENTKEY(curElem);
	}

	/*
	 * Search for next nonempty bucket starting at curBucket.
	 */
	hashp = status->hashp;
	ssize = hashp->ssize;

	if (curBucket > max_bucket)
	{
		hash_seq_term(status);
		return NULL;			/* search is done */
	}

	/*
	 * first find the right segment in the table directory.
	 */
	segment_num = curBucket >> hashp->sshift;
	segment_ndx = MOD(curBucket, ssize);

	segp = hashp->dir[segment_num];

	/*
	 * Pick up the first item in this bucket's chain.  If chain is not empty
	 * we can begin searching it.  Otherwise we have to advance to find the
	 * next nonempty bucket.  We try to optimize that case since searching a
	 * near-empty hashtable has to iterate this loop a lot.
	 */
	while ((curElem = segp[segment_ndx]) == NULL)
	{
		/* empty bucket, advance to next */
		if (++curBucket > max_bucket)
		{
			hash_seq_term(status);
			return NULL;		/* search is done */
		}
		if (++segment_ndx >= ssize)
		{
			segment_num++;
			segment_ndx = 0;
			segp = hashp->dir[segment_num];
		}
	}

	/* Begin scan of curBucket... */
	status->curEntry = curElem->link;
	if (status->curEntry == NULL)	/* end of this bucket */
		++curBucket;
	status->curBucket = curBucket;
	return ELEMENTKEY(curElem);

/*
 * hash_freeze
 *			Freeze a hashtable against future insertions (deletions are
 *			still allowed)
 *
 * The reason for doing this is that by preventing any more bucket splits,
 * we no longer need to worry about registering hash_seq_search scans,
 * and thus caller need not be careful about ensuring hash_seq_term gets
 * called at the right times.
 *
 * Multiple calls to hash_freeze() are allowed, but you can't freeze a table
 * with active scans (since hash_seq_term would then do the wrong thing).
 */

	if (!hashp->frozen && has_seq_scans(hashp))
		elog(ERROR, "cannot freeze hashtable \"%s\" because it has active scans",
			 hashp->tabname);
	hashp->frozen = true;

/********************************* UTILITIES ************************/

/*
 * Expand the table by adding one more hash bucket.
 */

#ifdef HASH_STATISTICS
	hctl->expansions++;
#endif

	new_segnum = new_bucket >> hashp->sshift;
	new_segndx = MOD(new_bucket, hashp->ssize);

	if (new_segnum >= hctl->nsegs)
	{
		/* Allocate new segment if necessary -- could fail if dir full */
		if (new_segnum >= hctl->dsize)
			if (!dir_realloc(hashp))
				return false;
		if (!(hashp->dir[new_segnum] = seg_alloc(hashp)))
			return false;
		hctl->nsegs++;
	}

	/* OK, we created a new bucket */

	/*
	 * *Before* changing masks, find old bucket corresponding to same hash
	 * values; values in that bucket may need to be relocated to new bucket.
	 * Note that new_bucket is certainly larger than low_mask at this point,
	 * so we can skip the first step of the regular hash mask calc.
	 */
	old_bucket = (new_bucket & hctl->low_mask);
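
	/*
	 * Illustrative example: when new_bucket is 5 (binary 101) and low_mask
	 * is still 3 (binary 011), the entries that may need to move are all in
	 * old bucket 5 & 3 = 1 (binary 001).
	 */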

	/*
	 * If we crossed a power of 2, readjust masks.
	 */

	/*
	 * Relocate records to the new bucket.  NOTE: because of the way the hash
	 * masking is done in calc_bucket, only one old bucket can need to be
	 * split at this point.  With a different way of reducing the hash value,
	 * that might not be true!
	 */
	old_segnum = old_bucket >> hashp->sshift;
	old_segndx = MOD(old_bucket, hashp->ssize);

	old_seg = hashp->dir[old_segnum];
	new_seg = hashp->dir[new_segnum];

	oldlink = &old_seg[old_segndx];
	newlink = &new_seg[new_segndx];

	for (currElement = *oldlink;
		 currElement != NULL;
		 currElement = nextElement)
	{
		nextElement = currElement->link;
		if (calc_bucket(hctl, currElement->hashvalue) == old_bucket)
		{
			*oldlink = currElement;
			oldlink = &currElement->link;
		}
		else
		{
			*newlink = currElement;
			newlink = &currElement->link;
		}
	}

	/* don't forget to terminate the rebuilt hash chains... */
	*oldlink = NULL;
	*newlink = NULL;

	/* Reallocate directory */

		memcpy(p, old_p, old_dirsize);
		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);

		/* XXX assume the allocator is palloc, so we know how to free */

/*
 * allocate some new elements and link them into the indicated free list
 */

	/* Each element has a HASHELEMENT header plus user data. */
	requestSize = nelem * elementSize;

	/* Add space for slist_node list link if we need one. */

	/* Allocate the memory. */
	allocedBlock = hashp->alloc(requestSize);

	/*
	 * If USE_VALGRIND, each allocated block of elements of a non-shared
	 * hashtable is chained into a list, so that Valgrind won't think it's
	 * been leaked.
	 */

	/* prepare to link all the new entries into the freelist */
	prevElement = NULL;
	tmpElement = firstElement;
	for (i = 0; i < nelem; i++)
	{
		tmpElement->link = prevElement;
		prevElement = tmpElement;
		tmpElement = (HASHELEMENT *) (((char *) tmpElement) + elementSize);
	}

	/* if partitioned, must lock to touch freeList */

	/* freelist could be nonempty if two backends did this concurrently */

/*
 * Do initial lookup of a bucket for the given hash value, retrieving its
 * bucket number and its hash bucket.
 */

	segment_num = bucket >> hashp->sshift;
	segment_ndx = MOD(bucket, hashp->ssize);

	segp = hashp->dir[segment_num];

	*bucketptr = &segp[segment_ndx];

/* complain when we have detected a corrupted hashtable */

	/*
	 * If the corruption is in a shared hashtable, we'd better force a
	 * systemwide restart.  Otherwise, just shut down this one backend.
	 */

/* calculate ceil(log base 2) of num */

	/*
	 * guard against too-large input, which would be invalid for
	 * pg_ceil_log2_64()
	 */

/* calculate first power of 2 >= num, bounded to what will fit in an int64 */

	/* my_log2's internal range check is sufficient */

/* calculate first power of 2 >= num, bounded to what will fit in an int */

	if (num > INT_MAX / 2)
		num = INT_MAX / 2;
	return (int) next_pow2_int64(num);

/************************* SEQ SCAN TRACKING ************************/

/*
 * We track active hash_seq_search scans here.  The need for this mechanism
 * comes from the fact that a scan will get confused if a bucket split occurs
 * while it's in progress: it might visit entries twice, or even miss some
 * entirely (if it's partway through the same bucket that splits).  Hence
 * we want to inhibit bucket splits if there are any active scans on the
 * table being inserted into.  This is a fairly rare case in current usage,
 * so just postponing the split until the next insertion seems sufficient.
 *
 * Given present usages of the function, only a few scans are likely to be
 * open concurrently; so a finite-size stack of open scans seems sufficient,
 * and we don't worry that linear search is too slow.  Note that we do
 * allow multiple scans of the same hashtable to be open concurrently.
 *
 * This mechanism can support concurrent scan and insertion in a shared
 * hashtable if it's the same backend doing both.  It would fail otherwise,
 * but locking reasons seem to preclude any such scenario anyway, so we don't
 * worry about it.
 *
 * This arrangement is reasonably robust if a transient hashtable is deleted
 * without notifying us.  The absolute worst case is we might inhibit splits
 * in another table created later at exactly the same address.  We will give
 * a warning at transaction end for reference leaks, so any bugs leading to
 * lack of notification should be easy to catch.
 */

#define MAX_SEQ_SCANS 100

/* Register a table as having an active hash_seq_search scan */

	if (num_seq_scans >= MAX_SEQ_SCANS)
		elog(ERROR, "too many active hash_seq_search scans, cannot start one on \"%s\"",
			 hashp->tabname);

/* Deregister an active scan */

	/* Search backward since it's most likely at the stack top */

	elog(ERROR, "no hash_seq_search scan for hash table \"%s\"",
		 hashp->tabname);

/* Check if a table has any active scan */

/* Clean up any open scans at end of transaction */

	/*
	 * During abort cleanup, open scans are expected; just silently clean 'em
	 * out.  An open scan at commit means someone forgot a hash_seq_term()
	 * call, so complain.
	 *
	 * Note: it's tempting to try to print the tabname here, but refrain for
	 * fear of touching deallocated memory.  This isn't a user-facing message
	 * anyway, so it needn't be pretty.
	 */

		if (isCommit)
			elog(WARNING, "leaked hash_seq_search scan for hash table %p",
				 seq_scan_tables[i]);

/* Clean up any open scans at end of subtransaction */

	/*
	 * Search backward to make cleanup easy.  Note we must check all entries,
	 * not only those at the end of the array, because deletion technique
	 * doesn't keep them in order.
	 */

			if (isCommit)
				elog(WARNING, "leaked hash_seq_search scan for hash table %p",
					 seq_scan_tables[i]);