/*-------------------------------------------------------------------------
 *
 * hash.h
 *	  header file for postgres hash access method implementation
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/access/hash.h
 *
 * NOTES
 *		modeled after Margo Seltzer's hash implementation for unix.
 *
 *-------------------------------------------------------------------------
 */
#include "catalog/pg_am_d.h"
/*
 * Mapping from hash bucket number to physical block number of bucket's
 * starting page.  Beware of multiple evaluations of argument!
 */
typedef uint32 Bucket;

#define InvalidBucket	((Bucket) 0xFFFFFFFF)

#define BUCKET_TO_BLKNO(metap,B) \
	((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_spareindex((B)+1)-1] : 0)) + 1)
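/*
 * A worked example (illustrative): block 0 is always the metapage, so with
 * no overflow pages allocated (hashm_spares[] all zero) bucket B starts at
 * block B + 1.  If, say, hashm_spares[_hash_spareindex(4) - 1] == 2, then
 * two overflow pages precede bucket 3's splitpoint, and
 * BUCKET_TO_BLKNO(metap, 3) yields 3 + 2 + 1 = 6.
 */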
/*
 * Special space for hash index pages.
 *
 * hasho_flag's LH_PAGE_TYPE bits tell us which type of page we're looking at.
 * Additional bits in the flag word are used for more transient purposes.
 *
 * To test a page's type, do (hasho_flag & LH_PAGE_TYPE) == LH_xxx_PAGE.
 * However, we ensure that each used page type has a distinct bit so that
 * we can OR together page types for uses such as the allowable-page-types
 * argument of _hash_checkpage().
 */
#define LH_UNUSED_PAGE			(0)
#define LH_OVERFLOW_PAGE		(1 << 0)
#define LH_BUCKET_PAGE			(1 << 1)
#define LH_BITMAP_PAGE			(1 << 2)
#define LH_META_PAGE			(1 << 3)
#define LH_BUCKET_BEING_POPULATED	(1 << 4)
#define LH_BUCKET_BEING_SPLIT	(1 << 5)
#define LH_BUCKET_NEEDS_SPLIT_CLEANUP	(1 << 6)
#define LH_PAGE_HAS_DEAD_TUPLES	(1 << 7)

#define LH_PAGE_TYPE \
	(LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE)
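/*
 * Illustrative usage (a sketch, not a declaration from this header):
 * classify a page via its opaque area, and OR page types together when
 * several are acceptable, as _hash_checkpage() allows:
 *
 *		HashPageOpaque opaque = HashPageGetOpaque(page);
 *
 *		if ((opaque->hasho_flag & LH_PAGE_TYPE) == LH_OVERFLOW_PAGE)
 *			... follow the bucket chain ...
 *
 *		_hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
 */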
/*
 * In an overflow page, hasho_prevblkno stores the block number of the previous
 * page in the bucket chain; in a bucket page, hasho_prevblkno stores the
 * hashm_maxbucket value as of the last time the bucket was split, or
 * else as of the time the bucket was created.  The latter convention is used
 * to determine whether a cached copy of the metapage is too stale to be used
 * without needing to lock or pin the metapage.
 *
 * hasho_nextblkno is always the block number of the next page in the
 * bucket chain, or InvalidBlockNumber if there are no more such pages.
 */
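/*
 * A sketch of the staleness test (variable names hypothetical): after
 * locking a bucket's primary page, compare its hasho_prevblkno with the
 * hashm_maxbucket in a cached metapage copy; if
 *
 *		opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket
 *
 * then the table has been split since the copy was taken, so the cached
 * metapage must be refreshed before its bucket mapping can be trusted.
 */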
#define HashPageGetOpaque(page) ((HashPageOpaque) PageGetSpecialPointer(page))

#define H_NEEDS_SPLIT_CLEANUP(opaque)	(((opaque)->hasho_flag & LH_BUCKET_NEEDS_SPLIT_CLEANUP) != 0)
#define H_BUCKET_BEING_SPLIT(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0)
#define H_BUCKET_BEING_POPULATED(opaque)	(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0)
#define H_HAS_DEAD_TUPLES(opaque)	(((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0)
/*
 * The page ID is for the convenience of pg_filedump and similar utilities,
 * which otherwise would have a hard time telling pages of different index
 * types apart.  It should be the last 2 bytes on the page.  This is more or
 * less "free" due to alignment considerations.
 */
#define HASHO_PAGE_ID	0xFF80
/*
 * The items array is always ordered in index order (ie, increasing
 * indexoffset).  When scanning backwards it is convenient to fill the
 * array back-to-front, so we start at the last slot and fill downwards.
 * Hence we need both a first-valid-entry and a last-valid-entry counter.
 * itemIndex is a cursor showing which entry was last returned to caller.
 */
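/*
 * For instance (illustrative): if a forward scan finds n matching tuples on
 * a page, they occupy items[0 .. n-1] with firstItem = 0 and
 * lastItem = n - 1; a backward scan over the same page instead fills
 * items[MaxIndexTuplesPerPage - n .. MaxIndexTuplesPerPage - 1], and
 * itemIndex then walks from lastItem down to firstItem.
 */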
#define HashScanPosIsPinned(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BufferIsValid((scanpos).buf) \
)

#define HashScanPosIsValid(scanpos) \
( \
	AssertMacro(BlockNumberIsValid((scanpos).currPage) || \
				!BufferIsValid((scanpos).buf)), \
	BlockNumberIsValid((scanpos).currPage) \
)

#define HashScanPosInvalidate(scanpos) \
	do { \
		(scanpos).buf = InvalidBuffer; \
		(scanpos).currPage = InvalidBlockNumber; \
		(scanpos).nextPage = InvalidBlockNumber; \
		(scanpos).prevPage = InvalidBlockNumber; \
		(scanpos).firstItem = 0; \
		(scanpos).lastItem = 0; \
		(scanpos).itemIndex = 0; \
	} while (0)
/*
 * HashScanOpaqueData is private state for a hash index scan.
 */

	/* Hash value of the scan key, ie, the hash key we seek */

	/* remember the buffer associated with primary bucket */

	/*
	 * remember the buffer associated with primary bucket page of bucket
	 * being split; it is needed while scanning the bucket that is being
	 * populated by the split.
	 */

	/* Whether scan starts on bucket being populated due to split */

	/*
	 * Whether we are scanning the bucket being split; consulted only when
	 * hashso_buc_populated is true.
	 */

	/* info about killed items if any (killedItems is NULL if never used) */

	/*
	 * Identify all the matching items on a page and save them in
	 * HashScanPosData.
	 */
/*
 * Definitions for metapage.
 */

#define HASH_METAPAGE	0		/* metapage is always block 0 */

#define HASH_MAGIC		0x6440640
#define HASH_VERSION	4
/*
 * spares[] holds the number of overflow pages currently allocated at or
 * before a certain splitpoint.  For example, if spares[3] = 7 then there are
 * 7 ovflpages before splitpoint 3 (compare BUCKET_TO_BLKNO macro).  The
 * value in spares[ovflpoint] increases as overflow pages are added at the
 * end of the index.  Once ovflpoint increases (ie, we have actually allocated
 * the bucket pages belonging to that splitpoint) the number of spares at the
 * prior splitpoint cannot change anymore.
 *
 * ovflpages that have been recycled for reuse can be found by looking at
 * bitmaps that are stored within ovflpages dedicated for the purpose.
 * The blknos of these bitmap pages are kept in mapp[]; nmaps is the
 * number of currently existing bitmaps.
 *
 * The limitation on the size of spares[] comes from the fact that there's
 * no point in having more than 2^32 buckets with only uint32 hashcodes.
 * (Note: HASH_MAX_SPLITPOINTS, the size of spares[], is adjusted to
 * accommodate the multi-phased allocation of buckets after
 * HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE.)
 *
 * There is no particular upper limit on the size of mapp[], other than
 * needing to fit into the metapage.  (With 8K block size, 1024 bitmaps
 * limit us to 256 GB of overflow space...)  For a smaller block size we
 * cannot use 1024 bitmaps, as that would make the metapage data cross the
 * block-size boundary.  So we use BLCKSZ to determine the maximum number
 * of bitmaps.
 */
#define HASH_MAX_BITMAPS	Min(BLCKSZ / 8, 1024)
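/*
 * Arithmetic check (illustrative, assuming the usual hashm_bmsize of 4096
 * bytes at 8K block size): each bitmap page tracks 4096 * 8 = 32768 overflow
 * pages, so 1024 bitmaps cover 2^10 * 2^15 pages * 8 KB = 2^38 bytes, i.e.
 * the 256 GB of overflow space mentioned above.
 */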
#define HASH_SPLITPOINT_PHASE_BITS	2
#define HASH_SPLITPOINT_PHASES_PER_GRP	(1 << HASH_SPLITPOINT_PHASE_BITS)
#define HASH_SPLITPOINT_PHASE_MASK	(HASH_SPLITPOINT_PHASES_PER_GRP - 1)
#define HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE	10

/* defines max number of splitpoint phases a hash index can have */
#define HASH_MAX_SPLITPOINT_GROUP	32
#define HASH_MAX_SPLITPOINTS \
	(((HASH_MAX_SPLITPOINT_GROUP - HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE) * \
	  HASH_SPLITPOINT_PHASES_PER_GRP) + \
	 HASH_SPLITPOINT_GROUPS_WITH_ONE_PHASE)
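/*
 * With the values above, HASH_MAX_SPLITPOINTS works out to
 * (32 - 10) * 4 + 10 = 98 splitpoint phases, which is the number of
 * entries in the metapage's hashm_spares[] array.
 */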
#define HashGetFillFactor(relation) \
	(AssertMacro(relation->rd_rel->relkind == RELKIND_INDEX && \
				 relation->rd_rel->relam == HASH_AM_OID), \
	 (relation)->rd_options ? \
	 ((HashOptions *) (relation)->rd_options)->fillfactor : \
	 HASH_DEFAULT_FILLFACTOR)
#define HashGetTargetPageUsage(relation) \
	(BLCKSZ * HashGetFillFactor(relation) / 100)
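/*
 * For example, with the default 8 KB block size and the default fillfactor
 * defined below, HashGetTargetPageUsage() is 8192 * 75 / 100 = 6144 bytes.
 */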
/*
 * Maximum size of a hash index item (it's okay to have only one per page)
 */
#define HashMaxItemSize(page) \
	MAXALIGN_DOWN(PageGetPageSize(page) - \
				  SizeOfPageHeaderData - \
				  sizeof(ItemIdData) - \
				  MAXALIGN(sizeof(HashPageOpaqueData)))

#define INDEX_MOVED_BY_SPLIT_MASK	INDEX_AM_RESERVED_BIT

#define HASH_MIN_FILLFACTOR			10
#define HASH_DEFAULT_FILLFACTOR		75
#define BYTE_TO_BIT		3		/* 2^3 bits/byte */
#define ALL_SET			((uint32) ~0)
/*
 * Bitmap pages do not contain tuples.  They do contain the standard
 * page headers and trailers; however, everything in between is a
 * giant bit array.  The number of bits that fit on a page obviously
 * depends on the page size and the header/trailer overhead.  We require
 * the number of bits per page to be a power of 2.
 */
#define BMPGSZ_BYTE(metap)		((metap)->hashm_bmsize)
#define BMPGSZ_BIT(metap)		((metap)->hashm_bmsize << BYTE_TO_BIT)
#define BMPG_SHIFT(metap)		((metap)->hashm_bmshift)
#define BMPG_MASK(metap)		(BMPGSZ_BIT(metap) - 1)

#define HashPageGetBitmap(page) \
	((uint32 *) PageGetContents(page))

#define HashGetMaxBitmapSize(page) \
	(PageGetPageSize((Page) page) - \
	 (MAXALIGN(SizeOfPageHeaderData) + MAXALIGN(sizeof(HashPageOpaqueData))))

#define HashPageGetMeta(page) \
	((HashMetaPage) PageGetContents(page))
/*
 * The number of bits in an ovflpage bitmap word.
 */
#define BITS_PER_MAP	32		/* Number of bits in uint32 */

/* Given the address of the beginning of a bit map, clear/set the nth bit */
#define CLRBIT(A, N)	((A)[(N)/BITS_PER_MAP] &= ~(1 << ((N) % BITS_PER_MAP)))
#define SETBIT(A, N)	((A)[(N)/BITS_PER_MAP] |= (1 << ((N) % BITS_PER_MAP)))
#define ISSET(A, N)		((A)[(N)/BITS_PER_MAP] & (1 << ((N) % BITS_PER_MAP)))
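/*
 * Illustrative sketch (variable names hypothetical): given a bitmap page
 * and an overflow-page bit number "ovflbitno", the bit within that page is
 * located with BMPG_MASK and then set, cleared, or tested:
 *
 *		uint32 *freep = HashPageGetBitmap(mappage);
 *		uint32	bitmapbit = ovflbitno & BMPG_MASK(metap);
 *
 *		SETBIT(freep, bitmapbit);
 *		CLRBIT(freep, bitmapbit);
 *		if (ISSET(freep, bitmapbit))
 *			...
 */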
/*
 * page-level and high-level locking modes (see README)
 */
#define HASH_READ		BUFFER_LOCK_SHARE
#define HASH_WRITE		BUFFER_LOCK_EXCLUSIVE
#define HASH_NOLOCK		(-1)
/*
 * When a new operator class is declared, we require that the user supply
 * us with an amproc function for hashing a key of the new type, returning
 * a 32-bit hash value.  We call this the "standard" hash function.  We
 * also allow an optional "extended" hash function which accepts a salt and
 * returns a 64-bit hash value.  This is highly recommended but, for reasons
 * of backward compatibility, optional.
 *
 * When the salt is 0, the low 32 bits of the value returned by the extended
 * hash function should match the value that would have been returned by the
 * standard hash function.
 */
#define HASHSTANDARD_PROC	1
#define HASHEXTENDED_PROC	2
#define HASHOPTIONS_PROC	3
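/*
 * Illustrative contract (a sketch, not a declaration from this header):
 * for an opclass providing both support functions, the following must hold
 * for any key, where standard_hash and extended_hash stand for the
 * opclass's HASHSTANDARD_PROC and HASHEXTENDED_PROC members:
 *
 *		(uint32) extended_hash(key, 0) == standard_hash(key)
 */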
/* public routines */

extern IndexBuildResult *hashbuild(Relation heap, Relation index,
								   struct IndexInfo *indexInfo);
extern void hashbuildempty(Relation index);
extern bool hashinsert(Relation rel, Datum *values, bool *isnull,
					   ItemPointer ht_ctid, Relation heapRel,
					   IndexUniqueCheck checkUnique,
					   bool indexUnchanged,
					   struct IndexInfo *indexInfo);
extern bool hashgettuple(IndexScanDesc scan, ScanDirection dir);
extern int64 hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm);
extern IndexScanDesc hashbeginscan(Relation rel, int nkeys, int norderbys);
extern void hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
					   ScanKey orderbys, int norderbys);
extern void hashendscan(IndexScanDesc scan);
extern IndexBulkDeleteResult *hashbulkdelete(IndexVacuumInfo *info,
											 IndexBulkDeleteResult *stats,
											 IndexBulkDeleteCallback callback,
											 void *callback_state);
extern IndexBulkDeleteResult *hashvacuumcleanup(IndexVacuumInfo *info,
												IndexBulkDeleteResult *stats);
extern bytea *hashoptions(Datum reloptions, bool validate);
extern bool hashvalidate(Oid opclassoid);
extern void hashadjustmembers(Oid opfamilyoid, Oid opclassoid,
							  List *operators, List *functions);
extern CompareType hashtranslatestrategy(StrategyNumber strategy,
										 Oid opfamily);
extern StrategyNumber hashtranslatecmptype(CompareType cmptype,
										   Oid opfamily);

/* private routines */

/* hashinsert.c */
extern void _hash_doinsert(Relation rel, IndexTuple itup, Relation heapRel,
						   bool sorted);
extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize,
								   IndexTuple itup, bool appendtup);
extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups,
								OffsetNumber *itup_offsets, uint16 nitups);

/* hashovfl.c */
extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf,
								bool retain_pin);
extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf,
									  Buffer ovflbuf, Buffer wbuf,
									  IndexTuple *itups,
									  OffsetNumber *itup_offsets,
									  Size *tups_size, uint16 nitups,
									  BufferAccessStrategy bstrategy);
extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage);
extern void _hash_squeezebucket(Relation rel, Bucket bucket,
								BlockNumber bucket_blkno, Buffer bucket_buf,
								BufferAccessStrategy bstrategy);
extern uint32 _hash_ovflblkno_to_bitno(HashMetaPage metap,
									   BlockNumber ovflblkno);

/* hashpage.c */
extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, int access,
						   int flags);
extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel,
												 BlockNumber blkno, int flags);
extern HashMetaPage _hash_getcachedmetap(Relation rel, Buffer *metabuf,
										 bool force_refresh);
extern Buffer _hash_getbucketbuf_from_hashkey(Relation rel, uint32 hashkey,
											  int access,
											  HashMetaPage *cachedmetap);
extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno);
extern void _hash_initbuf(Buffer buf, uint32 max_bucket, uint32 num_bucket,
						  uint32 flag, bool initpage);
extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno,
							  ForkNumber forkNum);
extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno,
										 int access, int flags,
										 BufferAccessStrategy bstrategy);
extern void _hash_relbuf(Relation rel, Buffer buf);
extern void _hash_dropbuf(Relation rel, Buffer buf);
extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so);
extern uint32 _hash_init(Relation rel, double num_tuples,
						 ForkNumber forkNum);
extern void _hash_init_metabuffer(Buffer buf, double num_tuples,
								  RegProcedure procid, uint16 ffactor,
								  bool initpage);
extern void _hash_pageinit(Page page, Size size);
extern void _hash_expandtable(Relation rel, Buffer metabuf);
extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf,
							   Bucket obucket, uint32 maxbucket,
							   uint32 highmask, uint32 lowmask);

/* hashsearch.c */
extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
extern bool _hash_first(IndexScanDesc scan, ScanDirection dir);

/* hashsort.c */
typedef struct HSpool HSpool;	/* opaque struct in hashsort.c */

extern HSpool *_h_spoolinit(Relation heap, Relation index, uint32 num_buckets);
extern void _h_spooldestroy(HSpool *hspool);
extern void _h_spool(HSpool *hspool, ItemPointer self,
					 const Datum *values, const bool *isnull);
extern void _h_indexbuild(HSpool *hspool, Relation heapRel);

/* hashutil.c */
extern bool _hash_checkqual(IndexScanDesc scan, IndexTuple itup);
extern uint32 _hash_datum2hashkey(Relation rel, Datum key);
extern uint32 _hash_datum2hashkey_type(Relation rel, Datum key, Oid keytype);
extern Bucket _hash_hashkey2bucket(uint32 hashkey, uint32 maxbucket,
								   uint32 highmask, uint32 lowmask);
extern uint32 _hash_spareindex(uint32 num_bucket);
extern uint32 _hash_get_totalbuckets(uint32 splitpoint_phase);
extern void _hash_checkpage(Relation rel, Buffer buf, int flags);
extern uint32 _hash_get_indextuple_hashkey(IndexTuple itup);
extern bool _hash_convert_tuple(Relation index,
								Datum *user_values, bool *user_isnull,
								Datum *index_values, bool *index_isnull);
extern OffsetNumber _hash_binsearch(Page page, uint32 hash_value);
extern OffsetNumber _hash_binsearch_last(Page page, uint32 hash_value);
extern BlockNumber _hash_get_oldblock_from_newbucket(Relation rel,
													 Bucket new_bucket);
extern BlockNumber _hash_get_newblock_from_oldbucket(Relation rel,
													 Bucket old_bucket);
extern Bucket _hash_get_newbucket_from_oldbucket(Relation rel,
												 Bucket old_bucket,
												 uint32 lowmask,
												 uint32 maxbucket);
extern void _hash_kill_items(IndexScanDesc scan);

/* hash.c */
extern void hashbucketcleanup(Relation rel, Bucket cur_bucket,
							  Buffer bucket_buf, BlockNumber bucket_blkno,
							  BufferAccessStrategy bstrategy,
							  uint32 maxbucket, uint32 highmask,
							  uint32 lowmask,
							  double *tuples_removed,
							  double *num_index_tuples,
							  bool split_cleanup,
							  IndexBulkDeleteCallback callback,
							  void *callback_state);