1/*-------------------------------------------------------------------------
4 * Utility code for Postgres hash implementation.
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/hash/hashutil.c
13 *-------------------------------------------------------------------------
24 #define CALC_NEW_BUCKET(old_bucket, lowmask) \
25 old_bucket | (lowmask + 1)
28 * _hash_checkqual -- does the index tuple satisfy the scan conditions?
34 * Currently, we can't check any of the scan conditions since we do not
35 * have the original index entry value to supply to the sk_func. Always
36 * return true; we expect that hashgettuple already set the recheck flag
37 * to make the main indexscan code do it.
44 while (scanKeySize > 0)
55 /* assume sk_func is strict */
62 datum,
key->sk_argument);
76 * _hash_datum2hashkey -- given a Datum, call the index's hash function
78 * The Datum is assumed to be of the index's column type, so we can use the
79 * "primary" hash function that's tracked for us by the generic index code.
87 /* XXX assumes index has only one attribute */
95 * _hash_datum2hashkey_type -- given a Datum of a specified type,
96 * hash it in a fashion compatible with this index
98 * This is much more expensive than _hash_datum2hashkey, so use it only in
99 * cross-type situations.
107 /* XXX assumes index has only one attribute */
113 elog(
ERROR,
"missing support function %d(%u,%u) for index \"%s\"",
122 * _hash_hashkey2bucket -- determine which bucket the hashkey maps to.
130 bucket = hashkey & highmask;
131 if (bucket > maxbucket)
132 bucket = bucket & lowmask;
138 * _hash_spareindex -- returns spare index / global splitpoint phase of the
150 return splitpoint_group;
152 /* account for single-phase groups */
155 /* account for multi-phase groups before splitpoint_group */
160 /* account for phases within current group */
162 (((num_bucket - 1) >>
166 return splitpoint_phases;
170 * _hash_get_totalbuckets -- returns total number of buckets allocated till
171 * the given splitpoint phase.
178 uint32 phases_within_splitpoint_group;
181 return (1 << splitpoint_phase);
183 /* get splitpoint's group */
189 /* account for buckets before splitpoint_group */
190 total_buckets = (1 << (splitpoint_group - 1));
192 /* account for buckets within splitpoint_group */
193 phases_within_splitpoint_group =
198 phases_within_splitpoint_group);
200 return total_buckets;
204 * _hash_checkpage -- sanity checks on the format of all hash pages
206 * If flags is not zero, it is a bitwise OR of the acceptable page types
207 * (values of hasho_flag & LH_PAGE_TYPE).
215 * ReadBuffer verifies that every newly-read page passes
216 * PageHeaderIsValid, which means it either contains a reasonably sane
217 * page header or is all-zero. We have to defend against the all-zero
222 (
errcode(ERRCODE_INDEX_CORRUPTED),
223 errmsg(
"index \"%s\" contains unexpected zero page at block %u",
226 errhint(
"Please REINDEX it.")));
229 * Additionally check that the special area looks sane.
233 (
errcode(ERRCODE_INDEX_CORRUPTED),
234 errmsg(
"index \"%s\" contains corrupted page at block %u",
237 errhint(
"Please REINDEX it.")));
245 (
errcode(ERRCODE_INDEX_CORRUPTED),
246 errmsg(
"index \"%s\" contains corrupted page at block %u",
249 errhint(
"Please REINDEX it.")));
253 * When checking the metapage, also verify magic number and version.
261 (
errcode(ERRCODE_INDEX_CORRUPTED),
262 errmsg(
"index \"%s\" is not a hash index",
267 (
errcode(ERRCODE_INDEX_CORRUPTED),
268 errmsg(
"index \"%s\" has wrong hash version",
270 errhint(
"Please REINDEX it.")));
288 * _hash_get_indextuple_hashkey - get the hash index tuple's hash key value
296 * We assume the hash key is the first attribute and can't be null, so
297 * this can be done crudely but very very cheaply ...
300 return *((
uint32 *) attp);
304 * _hash_convert_tuple - convert raw index data to hash key
306 * Inputs: values and isnull arrays for the user data column(s)
307 * Outputs: values and isnull arrays for the index tuple, suitable for
308 * passing to index_form_tuple().
310 * Returns true if successful, false if not (because there are null values).
311 * On a false result, the given data need not be indexed.
313 * Note: callers know that the index-column arrays are always of length 1.
314 * In principle, there could be more than one input column, though we do not
315 * currently support that.
319 Datum *user_values,
bool *user_isnull,
320 Datum *index_values,
bool *index_isnull)
325 * We do not insert null values into hash indexes. This is okay because
326 * the only supported search operator is '=', and we assume it is strict.
333 index_isnull[0] =
false;
338 * _hash_binsearch - Return the offset number in the page where the
339 * specified hash value should be sought or inserted.
341 * We use binary search, relying on the assumption that the existing entries
342 * are ordered by hash key.
344 * Returns the offset of the first index entry having hashkey >= hash_value,
345 * or the page's max offset plus one if hash_value is greater than all
346 * existing hash keys in the page. This is the appropriate place to start
347 * a search, or to insert a new item.
355 /* Loop invariant: lower <= desired place <= upper */
370 if (hashkey < hash_value)
380 * _hash_binsearch_last
382 * Same as above, except that if there are multiple matching items in the
383 * page, we return the offset of the last one instead of the first one,
384 * and the possible range of outputs is 0..maxoffset not 1..maxoffset+1.
385 * This is handy for starting a new page in a backwards scan.
393 /* Loop invariant: lower <= desired place <= upper */
408 if (hashkey > hash_value)
418 * _hash_get_oldblock_from_newbucket() -- get the block number of a bucket
419 * from which current (new) bucket is being split.
431 * To get the old bucket from the current bucket, we need a mask to modulo
432 * into lower half of table. This mask is stored in meta page as
433 * hashm_lowmask, but here we can't rely on the same, because we need a
434 * value of lowmask that was prevalent at the time when bucket split was
435 * started. Masking the most significant bit of new bucket would give us
439 old_bucket = new_bucket & mask;
452 * _hash_get_newblock_from_oldbucket() -- get the block number of a bucket
453 * that will be generated after split from old bucket.
455 * This is used to find the new bucket from old bucket based on current table
456 * half. It is mainly required to finish the incomplete splits where we are
457 * sure that not more than one bucket could have split in progress from old
482 * _hash_get_newbucket_from_oldbucket() -- get the new bucket that will be
483 * generated after split from current (old) bucket.
485 * This is used to find the new bucket from old bucket. New bucket can be
486 * obtained by OR'ing old bucket with most significant bit of current table
487 * half (lowmask passed in this function can be used to identify msb of
488 * current table half). There could be multiple buckets that could have
489 * been split from current bucket. We need the first such bucket that exists.
490 * Caller must ensure that no more than one split has happened from old
500 if (new_bucket > maxbucket)
502 lowmask = lowmask >> 1;
510 * _hash_kill_items - set LP_DEAD state for items an indexscan caller has
511 * told us were killed.
513 * scan->opaque, referenced locally through so, contains information about the
514 * current page and killed tuples thereon (generally, this should only be
515 * called if so->numKilled > 0).
517 * The caller does not have a lock on the page and may or may not have the
518 * page pinned in a buffer. Note that read-lock is sufficient for setting
519 * LP_DEAD status (which is only a hint).
521 * The caller must have pin on bucket buffer, but may or may not have pin
522 * on overflow buffer, as indicated by HashScanPosIsPinned(so->currPos).
524 * We match items by heap TID before assuming they are the right ones to
527 * There are never any scans active in a bucket at the time VACUUM begins,
528 * because VACUUM takes a cleanup lock on the primary bucket page and scans
529 * hold a pin. A scan can begin after VACUUM leaves the primary bucket page
530 * but before it finishes the entire bucket, but it can never pass VACUUM,
531 * because VACUUM always locks the next page before releasing the lock on
532 * the previous one. Therefore, we don't have to worry about accidentally
533 * killing a TID that has been reused for an unrelated tuple.
548 bool killedsomething =
false;
549 bool havePin =
false;
556 * Always reset the scan state, so we don't look for same items on other
565 * We already have pin on this buffer, so, all we need to do is
566 * acquire lock on it.
579 for (
i = 0;
i < numKilled;
i++)
587 itemIndex <= so->currPos.lastItem);
589 while (offnum <= maxoff)
598 killedsomething =
true;
599 break;
/* out of inner search loop */
606 * Since this can be redone later if needed, mark as dirty hint. Whenever
607 * we mark anything LP_DEAD, we also set the page's
608 * LH_PAGE_HAS_DEAD_TUPLES flag, which is likewise just a hint.
static bool validate(Port *port, const char *auth)
BlockNumber BufferGetBlockNumber(Buffer buffer)
void LockBuffer(Buffer buffer, int mode)
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
#define BUFFER_LOCK_UNLOCK
#define BUFFER_LOCK_SHARE
static Page BufferGetPage(Buffer buffer)
static uint16 PageGetSpecialSize(const PageData *page)
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
static bool PageIsNew(const PageData *page)
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
#define RegProcedureIsValid(p)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
Datum FunctionCall2Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2)
Datum OidFunctionCall1Coll(Oid functionId, Oid collation, Datum arg1)
Datum FunctionCall1Coll(FmgrInfo *flinfo, Oid collation, Datum arg1)
#define HashPageGetOpaque(page)
#define HASHSTANDARD_PROC
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE
#define HashScanPosIsPinned(scanpos)
#define HashPageGetMeta(page)
#define BUCKET_TO_BLKNO(metap, B)
#define HASH_SPLITPOINT_PHASE_MASK
HashScanOpaqueData * HashScanOpaque
#define HashScanPosIsValid(scanpos)
#define HASH_SPLITPOINT_PHASE_BITS
#define LH_PAGE_HAS_DEAD_TUPLES
Assert(PointerIsAligned(start, uint64))
void _hash_relbuf(Relation rel, Buffer buf)
Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access, int flags)
uint32 _hash_spareindex(uint32 num_bucket)
BlockNumber _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
uint32 _hash_get_totalbuckets(uint32 splitpoint_phase)
uint32 _hash_get_indextuple_hashkey(IndexTuple itup)
bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup)
OffsetNumber _hash_binsearch(Page page, uint32 hash_value)
uint32 _hash_datum2hashkey(Relation rel, Datum key)
Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket, uint32 highmask, uint32 lowmask)
OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value)
void _hash_checkpage(Relation rel, Buffer buf, int flags)
Bucket _hash_get_newbucket_from_oldbucket(Relation rel, Bucket old_bucket, uint32 lowmask, uint32 maxbucket)
bytea * hashoptions(Datum reloptions, bool validate)
BlockNumber _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype)
void _hash_kill_items(IndexScanDesc scan)
#define CALC_NEW_BUCKET(old_bucket, lowmask)
bool _hash_convert_tuple(Relation index, Datum *user_values, bool *user_isnull, Datum *index_values, bool *index_isnull)
FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)
#define ItemIdMarkDead(itemId)
bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
IndexTupleData * IndexTuple
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static Size IndexInfoFindDataOffset(unsigned short t_info)
Oid get_opfamily_proc(Oid opfamily, Oid lefttype, Oid righttype, int16 procnum)
#define OffsetNumberIsValid(offsetNumber)
#define OffsetNumberNext(offsetNumber)
#define FirstOffsetNumber
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
static int pg_leftmost_one_pos32(uint32 word)
static uint32 pg_ceil_log2_32(uint32 num)
static uint32 DatumGetUInt32(Datum X)
static bool DatumGetBool(Datum X)
static Datum UInt32GetDatum(uint32 X)
#define RelationGetDescr(relation)
#define RelationGetRelationName(relation)
void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)
HashScanPosItem items[MaxIndexTuplesPerPage]
struct ScanKeyData * keyData