1/*-------------------------------------------------------------------------
4 * HyperLogLog cardinality estimator
6 * Portions Copyright (c) 2014-2025, PostgreSQL Global Development Group
8 * Based on Hideaki Ohno's C++ implementation. This is probably not ideally
9 * suited to estimating the cardinality of very large sets; in particular, we
10 * have not attempted to further optimize the implementation as described in
11 * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic
12 * Engineering of a State of The Art Cardinality Estimation Algorithm".
14 * A sparse representation of HyperLogLog state is used, with fixed space
17 * The copyright terms of Ohno's original version (the MIT license) follow.
20 * src/backend/lib/hyperloglog.c
22 *-------------------------------------------------------------------------
26 * Copyright (c) 2013 Hideaki Ohno <hide.o.j55{at}gmail.com>
28 * Permission is hereby granted, free of charge, to any person obtaining a copy
29 * of this software and associated documentation files (the 'Software'), to
30 * deal in the Software without restriction, including without limitation the
31 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
32 * sell copies of the Software, and to permit persons to whom the Software is
33 * furnished to do so, subject to the following conditions:
35 * The above copyright notice and this permission notice shall be included in
36 * all copies or substantial portions of the Software.
38 * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
39 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
40 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
41 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
42 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
43 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
54 #define POW_2_32 (4294967296.0)
55 #define NEG_POW_2_32 (-4294967296.0)
60 * Initialize HyperLogLog track state, by bit width
62 * bwidth is bit width (so register size will be 2 to the power of bwidth).
63 * Must be between 4 and 16 inclusive.
70 if (bwidth < 4 || bwidth > 16)
71 elog(
ERROR,
"bit width must be between 4 and 16 inclusive");
78 * Initialize hashes array to zero, not negative infinity, per discussion
79 * of the coupon collector problem in the HyperLogLog paper
84 * "alpha" is a value that for each possible number of registers (m) is
85 * used to correct a systematic multiplicative bias present in m ^ 2 Z (Z
86 * is "the indicator function" through which we finally compute E,
87 * estimated cardinality).
101 alpha = 0.7213 / (1.0 + 1.079 / cState->
nRegisters);
105 * Precalculate alpha m ^ 2, later used to generate "raw" HyperLogLog
112 * Initialize HyperLogLog track state, by error rate
114 * Instead of specifying bwidth (number of bits used for addressing the
115 * register), this method allows sizing the counter for particular error
116 * rate using a simple formula from the paper:
120 * where 'm' is the number of registers, i.e. (2^bwidth). The method
121 * finds the lowest bwidth with 'e' below the requested error rate, and
122 * then uses it to initialize the counter.
124 * As bwidth has to be between 4 and 16, the achievable error rate ranges
125 * from ~26% (bwidth=4) down to ~0.4% (bwidth=16).
134 double m = (
Size) 1 << bwidth;
136 if (1.04 / sqrt(m) <
error)
145 * Free HyperLogLog track state
147 * Releases allocated resources, but not the state itself (in case it's not
148 * allocated by palloc).
158 * Adds element to the estimator, from caller-supplied hash.
160 * It is critical that the hash value passed be an actual hash value, typically
161 * generated using hash_any(). The algorithm relies on a specific bit-pattern
162 * observable in conjunction with stochastic averaging. There must be a
163 * uniform distribution of bits in hash values for each distinct original value
172 /* Use the first "k" (registerWidth) bits as a zero based index */
175 /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */
183 * Estimates cardinality, based on elements added so far
197 /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */
198 result = cState->
alphaMM / sum;
200 if (result <= (5.0 / 2.0) * cState->
nRegisters)
202 /* Small range correction */
215 else if (result > (1.0 / 30.0) *
POW_2_32)
217 /* Large range correction */
225 * Worker for addHyperLogLog().
227 * Calculates the 1-based position of the first set bit within the first b
228 * bits of the x argument, reading from most significant to least significant
231 * Example (when considering the first 10 bits of x):
233 * rho(x = 0b1000000000) returns 1
234 * rho(x = 0b0010000000) returns 3
235 * rho(x = 0b0000000000) returns b + 1
237 * "The binary address determined by the first b bits of x"
239 * Return value "j" used to index bit pattern to watch.
Assert(PointerIsAligned(start, uint64))
void initHyperLogLog(hyperLogLogState *cState, uint8 bwidth)
void initHyperLogLogError(hyperLogLogState *cState, double error)
static uint8 rho(uint32 x, uint8 b)
double estimateHyperLogLog(hyperLogLogState *cState)
void addHyperLogLog(hyperLogLogState *cState, uint32 hash)
void freeHyperLogLog(hyperLogLogState *cState)
void pfree(void *pointer)
void * palloc0(Size size)
static int pg_leftmost_one_pos32(uint32 word)
static unsigned hash(unsigned *uv, int n)