1/*-------------------------------------------------------------------------
4 * Generic hashing functions, and hash functions for use in dynahash.c
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
16 * It is expected that every bit of a hash function's 32-bit result is
17 * as random as every other; failure to ensure this is likely to lead
18 * to poor performance of hash tables. In most cases a hash
19 * function should use hash_bytes() or its variant hash_bytes_uint32(),
20 * or the wrappers hash_any() and hash_uint32 defined in hashfn.h.
22 *-------------------------------------------------------------------------
31 * This hash function was written by Bob Jenkins
32 * (bob_jenkins@burtleburtle.net), and superficially adapted
33 * for PostgreSQL by Neil Conway. For more information on this
34 * hash function, see http://burtleburtle.net/bob/hash/doobs.html,
35 * or Bob's article in Dr. Dobb's Journal, Sept. 1997.
37 * In the current code, we have adopted Bob's 2006 update of his hash
38 * function to fetch the data a word at a time when it is suitably aligned.
39 * This makes for a useful speedup, at the cost of having to maintain
40 * four code paths (aligned vs unaligned, and little-endian vs big-endian).
41 * It also uses two separate mixing functions mix() and final(), instead
42 * of a slower multi-purpose function.
45/* Get a bit mask of the bits set in non-uint32 aligned addresses */
46 #define UINT32_ALIGN_MASK (sizeof(uint32) - 1)
48 #define rot(x,k) pg_rotate_left32(x, k)
51 * mix -- mix 3 32-bit values reversibly.
53 * This is reversible, so any information in (a,b,c) before mix() is
54 * still in (a,b,c) after mix().
56 * If four pairs of (a,b,c) inputs are run through mix(), or through
57 * mix() in reverse, there are at least 32 bits of the output that
58 * are sometimes the same for one pair and different for another pair.
59 * This was tested for:
60 * * pairs that differed by one bit, by two bits, in any combination
61 * of top bits of (a,b,c), or in any combination of bottom bits of
63 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
64 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
65 * is commonly produced by subtraction) look like a single 1-bit
67 * * the base values were pseudorandom, all zero but one bit set, or
68 * all zero plus a counter that starts at zero.
70 * This does not achieve avalanche. There are input bits of (a,b,c)
71 * that fail to affect some output bits of (a,b,c), especially of a. The
72 * most thoroughly mixed value is c, but it doesn't really even achieve
75 * This allows some parallelism. Read-after-writes are good at doubling
76 * the number of bits affected, so the goal of mixing pulls in the opposite
77 * direction from the goal of parallelism. I did what I could. Rotates
78 * seem to cost as much as shifts on every machine I could lay my hands on,
79 * and rotates are much kinder to the top and bottom bits, so I used rotates.
84 a -= c; a ^= rot(c, 4); c += b; \
85 b -= a; b ^= rot(a, 6); a += c; \
86 c -= b; c ^= rot(b, 8); b += a; \
87 a -= c; a ^= rot(c,16); c += b; \
88 b -= a; b ^= rot(a,19); a += c; \
89 c -= b; c ^= rot(b, 4); b += a; \
93 * final -- final mixing of 3 32-bit values (a,b,c) into c
95 * Pairs of (a,b,c) values differing in only a few bits will usually
96 * produce values of c that look totally different. This was tested for
97 * * pairs that differed by one bit, by two bits, in any combination
98 * of top bits of (a,b,c), or in any combination of bottom bits of
100 * * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
101 * the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
102 * is commonly produced by subtraction) look like a single 1-bit
104 * * the base values were pseudorandom, all zero but one bit set, or
105 * all zero plus a counter that starts at zero.
107 * The use of separate functions for mix() and final() allow for a
108 * substantial performance increase since final() does not need to
109 * do well in reverse, but is does need to affect all output bits.
110 * mix(), on the other hand, does not need to affect all output
111 * bits (affecting 32 bits is enough). The original hash function had
112 * a single mixing operation that had to satisfy both sets of requirements
113 * and was slower as a result.
116 #define final(a,b,c) \
118 c ^= b; c -= rot(b,14); \
119 a ^= c; a -= rot(c,11); \
120 b ^= a; b -= rot(a,25); \
121 c ^= b; c -= rot(b,16); \
122 a ^= c; a -= rot(c, 4); \
123 b ^= a; b -= rot(a,14); \
124 c ^= b; c -= rot(b,24); \
128 * hash_bytes() -- hash a variable-length key into a 32-bit value
129 * k : the key (the unaligned variable-length array of bytes)
130 * len : the length of the key, counting by bytes
132 * Returns a uint32 value. Every bit of the key affects every bit of
133 * the return value. Every 1-bit and 2-bit delta achieves avalanche.
134 * About 6*len+35 instructions. The best hash table sizes are powers
135 * of 2. There is no need to do mod a prime (mod is sooo slow!).
136 * If you need less than 32 bits, use a bitmask.
138 * This procedure must never throw elog(ERROR); the ResourceOwner code
139 * relies on this not to fail.
141 * Note: we could easily change this function to return a 64-bit hash value
142 * by using the final values of both b and c. b is perhaps a little less
143 * well mixed than c, however.
153 /* Set up the internal state */
155 a =
b =
c = 0x9e3779b9 +
len + 3923095;
157 /* If the source pointer is word-aligned, we use word-wide fetches */
160 /* Code path for aligned source data */
163 /* handle most of the key */
174 /* handle the last 11 bytes */
175 k = (
const unsigned char *) ka;
176#ifdef WORDS_BIGENDIAN
189 /* the lowest byte of c is reserved for the length */
213 /* case 0: nothing left to add */
215#else /* !WORDS_BIGENDIAN */
228 /* the lowest byte of c is reserved for the length */
252 /* case 0: nothing left to add */
254#endif /* WORDS_BIGENDIAN */
258 /* Code path for non-aligned source data */
260 /* handle most of the key */
263#ifdef WORDS_BIGENDIAN
267#else /* !WORDS_BIGENDIAN */
271#endif /* WORDS_BIGENDIAN */
277 /* handle the last 11 bytes */
278#ifdef WORDS_BIGENDIAN
291 /* the lowest byte of c is reserved for the length */
314 /* case 0: nothing left to add */
316#else /* !WORDS_BIGENDIAN */
329 /* the lowest byte of c is reserved for the length */
352 /* case 0: nothing left to add */
354#endif /* WORDS_BIGENDIAN */
359 /* report the result */
364 * hash_bytes_extended() -- hash into a 64-bit value, using an optional seed
365 * k : the key (the unaligned variable-length array of bytes)
366 * len : the length of the key, counting by bytes
367 * seed : a 64-bit seed (0 means no seed)
369 * Returns a uint64 value. Otherwise similar to hash_bytes.
379 /* Set up the internal state */
381 a =
b =
c = 0x9e3779b9 +
len + 3923095;
383 /* If the seed is non-zero, use it to perturb the internal state. */
387 * In essence, the seed is treated as part of the data being hashed,
388 * but for simplicity, we pretend that it's padded with four bytes of
389 * zeroes so that the seed constitutes a 12-byte chunk.
396 /* If the source pointer is word-aligned, we use word-wide fetches */
399 /* Code path for aligned source data */
402 /* handle most of the key */
413 /* handle the last 11 bytes */
414 k = (
const unsigned char *) ka;
415#ifdef WORDS_BIGENDIAN
428 /* the lowest byte of c is reserved for the length */
452 /* case 0: nothing left to add */
454#else /* !WORDS_BIGENDIAN */
467 /* the lowest byte of c is reserved for the length */
491 /* case 0: nothing left to add */
493#endif /* WORDS_BIGENDIAN */
497 /* Code path for non-aligned source data */
499 /* handle most of the key */
502#ifdef WORDS_BIGENDIAN
506#else /* !WORDS_BIGENDIAN */
510#endif /* WORDS_BIGENDIAN */
516 /* handle the last 11 bytes */
517#ifdef WORDS_BIGENDIAN
530 /* the lowest byte of c is reserved for the length */
553 /* case 0: nothing left to add */
555#else /* !WORDS_BIGENDIAN */
568 /* the lowest byte of c is reserved for the length */
591 /* case 0: nothing left to add */
593#endif /* WORDS_BIGENDIAN */
598 /* report the result */
603 * hash_bytes_uint32() -- hash a 32-bit value to a 32-bit value
605 * This has the same result as
606 * hash_bytes(&k, sizeof(uint32))
607 * but is faster and doesn't force the caller to store k into memory.
621 /* report the result */
626 * hash_bytes_uint32_extended() -- hash 32-bit value to 64-bit value, with seed
628 * Like hash_bytes_uint32, this is a convenience function.
650 /* report the result */
655 * string_hash: hash function for keys that are NUL-terminated strings.
657 * NOTE: this is the default hash function if none is specified.
663 * If the string exceeds keysize-1 bytes, we want to hash only that many,
664 * because when it is copied into the hash table it will be truncated at
667 Size s_len = strlen((
const char *)
key);
669 s_len =
Min(s_len, keysize - 1);
674 * tag_hash: hash function for fixed-size tag values
679 return hash_bytes((
const unsigned char *)
key, (
int) keysize);
683 * uint32_hash: hash function for keys that are uint32 or int32
685 * (tag_hash works for this case too, but is slower)
uint32 hash_bytes_uint32(uint32 k)
uint64 hash_bytes_extended(const unsigned char *k, int keylen, uint64 seed)
uint32 hash_bytes(const unsigned char *k, int keylen)
uint32 tag_hash(const void *key, Size keysize)
uint64 hash_bytes_uint32_extended(uint32 k, uint64 seed)
#define UINT32_ALIGN_MASK
uint32 uint32_hash(const void *key, Size keysize)
uint32 string_hash(const void *key, Size keysize)
Assert(PointerIsAligned(start, uint64))