/*-------------------------------------------------------------------------
 *
 * hash.c
 *    Implementation of Margo Seltzer's Hashing package for postgres.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/access/hash/hash.c
 *
 * NOTES
 *    This file contains only the public interface routines.
 *
 *-------------------------------------------------------------------------
 */
#include "utils/fmgrprotos.h"
/* Working state for hashbuild and its callback */
typedef struct
{
    HSpool     *spool;          /* NULL if not using spooling */
    double      indtuples;      /* # tuples accepted into index */
    Relation    heapRel;        /* heap relation descriptor */
} HashBuildState;
/*
 * Hash handler function: return IndexAmRoutine with access method parameters
 * and callbacks.
 */
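/*
 * A minimal sketch of such a handler.  The function name below is
 * illustrative only; it simply shows a few representative IndexAmRoutine
 * fields being filled in with this module's callbacks.
 */
Datum
hash_handler_sketch(PG_FUNCTION_ARGS)
{
    IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

    amroutine->amstrategies = HTMaxStrategyNumber;
    amroutine->amcanorder = false;
    amroutine->amcanunique = false;
    amroutine->amusemaintenanceworkmem = false;

    amroutine->ambuild = hashbuild;
    amroutine->ambuildempty = hashbuildempty;
    amroutine->aminsert = hashinsert;
    amroutine->ambulkdelete = hashbulkdelete;
    amroutine->amvacuumcleanup = hashvacuumcleanup;
    amroutine->amcostestimate = hashcostestimate;
    amroutine->amoptions = hashoptions;
    amroutine->amvalidate = hashvalidate;
    amroutine->ambeginscan = hashbeginscan;
    amroutine->amrescan = hashrescan;
    amroutine->amgettuple = hashgettuple;
    amroutine->amgetbitmap = hashgetbitmap;
    amroutine->amendscan = hashendscan;

    PG_RETURN_POINTER(amroutine);
}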
/*
 *	hashbuild() -- build a new hash index.
 */

    /*
     * We expect to be called exactly once for any index relation. If that's
     * not the case, big trouble's what we have.
     */
    if (RelationGetNumberOfBlocks(index) != 0)
        elog(ERROR, "index \"%s\" already contains data",
             RelationGetRelationName(index));
    /* Estimate the number of rows currently present in the table */

    /* Initialize the hash index metadata page and initial buckets */
    /*
     * If we just insert the tuples into the index in scan order, then
     * (assuming their hash codes are pretty random) there will be no locality
     * of access to the index, and if the index is bigger than available RAM
     * then we'll thrash horribly.  To prevent that scenario, we can sort the
     * tuples by (expected) bucket number.  However, such a sort is useless
     * overhead when the index does fit in RAM.  We choose to sort if the
     * initial index size exceeds maintenance_work_mem, or the number of
     * buffers usable for the index, whichever is less.  (Limiting by the
     * number of buffers should reduce thrashing between PG buffers and kernel
     * buffers, which seems useful even if no physical I/O results.  Limiting
     * by maintenance_work_mem is useful to allow easy testing of the sort
     * code path, and may be useful to DBAs as an additional control knob.)
     *
     * NOTE: this test will need adjustment if a bucket is ever different from
     * one page.  Also, "initial index size" accounting does not include the
     * metapage, nor the first bitmap page.
     */
    if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
        sort_threshold = Min(sort_threshold, NBuffers);

    if (num_buckets >= sort_threshold)
        buildstate.spool = _h_spoolinit(heap, index, num_buckets);
    else
        buildstate.spool = NULL;

    /* prepare to build the index */
    buildstate.indtuples = 0;
    buildstate.heapRel = heap;
    /* do the heap scan */
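    /*
     * A minimal sketch of that scan, assuming a local "reltuples" holds the
     * heap tuple count returned by table_index_build_scan(), which drives
     * hashbuildCallback() for each live heap tuple:
     */
    reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
                                       hashbuildCallback,
                                       (void *) &buildstate, NULL);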
    if (buildstate.spool)
    {
        /* sort the tuples and insert them into the index */
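        /*
         * A minimal sketch of finishing the spooled build, assuming the
         * HSpool sorting API (_h_indexbuild, _h_spooldestroy):
         */
        _h_indexbuild(buildstate.spool, heap);
        _h_spooldestroy(buildstate.spool);
    }

    /*
     * A minimal sketch of the result, assuming a local "result" and that
     * "reltuples" holds the heap tuple count from the scan above:
     */
    result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
    result->heap_tuples = reltuples;
    result->index_tuples = buildstate.indtuples;

    return result;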
/*
 *	hashbuildempty() -- build an empty hash index in the initialization fork
 */
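    /*
     * A minimal sketch: an empty index is just an initialized metapage (and
     * initial buckets) in the init fork, e.g. via _hash_init():
     */
    _hash_init(index, 0, INIT_FORKNUM);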
/*
 * Per-tuple callback for table_index_build_scan
 */
    Datum       index_values[1];
    bool        index_isnull[1];
    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(index, values, isnull,
                             index_values, index_isnull))
        return;

    /* Either spool the tuple for sorting, or just put it into the index */
    if (buildstate->spool)
        _h_spool(buildstate->spool, tid, index_values, index_isnull);
    else
    {
        /* form an index tuple and point it at the heap tuple */
        itup = index_form_tuple(RelationGetDescr(index),
                                index_values, index_isnull);
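        /*
         * A minimal sketch of the rest of this branch, assuming
         * _hash_doinsert() places the tuple and "tid" is the heap TID passed
         * to the callback:
         */
        itup->t_tid = *tid;
        _hash_doinsert(index, itup, buildstate->heapRel, false);
        pfree(itup);
    }

    buildstate->indtuples += 1;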
/*
 *	hashinsert() -- insert an index tuple into a hash table.
 *
 *	Hash on the heap tuple's key, form an index tuple with hash code.
 *	Find the appropriate location for the new tuple, and put it there.
 */
    Datum       index_values[1];
    bool        index_isnull[1];
    /* convert data to a hash key; on failure, do not insert anything */
    if (!_hash_convert_tuple(rel, values, isnull,
                             index_values, index_isnull))
        return false;
    /* form an index tuple and point it at the heap tuple */
    itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
    itup->t_tid = *ht_ctid;
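    /*
     * A minimal sketch of the rest of the insertion, assuming
     * _hash_doinsert() finds the right bucket page for the tuple's hash code:
     */
    _hash_doinsert(rel, itup, heapRel, false);

    pfree(itup);

    return false;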
/*
 *	hashgettuple() -- Get the next tuple in the scan.
 */

    /* Hash indexes are always lossy since we store only the hash code */
    scan->xs_recheck = true;
    /*
     * If we've already initialized this scan, we can just advance it in the
     * appropriate direction.  If we haven't done so yet, we call a routine
     * to get the first item in the scan.
     */
        /*
         * Check to see if we should kill the previously-fetched tuple.
         */

            /*
             * Yes, so remember it for later.  (We'll deal with all such
             * tuples at once right after leaving the index page or at end of
             * scan.)  If the caller reverses the indexscan direction, it is
             * quite possible that the same item might get entered multiple
             * times.  But we don't detect that; instead, we just forget any
             * excess entries.
             */
    /*
     * Now continue the scan.
     */
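    /*
     * A minimal sketch, assuming a local "res" flag and that _hash_next()
     * advances the scan position in the requested direction:
     */
    res = _hash_next(scan, dir);

    return res;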
/*
 *	hashgetbitmap() -- get all tuples at once
 */

        /*
         * _hash_first and _hash_next eliminate dead index entries whenever
         * scan->ignore_killed_tuples is true.  Therefore, there's nothing to
         * do here except add the results to the TIDBitmap.
         */
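    /*
     * A minimal sketch of the loop, assuming locals "res", "ntids" and
     * "currItem", that the current match is kept in the scan opaque state
     * ("so"), and that every match is reported as lossy (recheck = true):
     */
    res = _hash_first(scan, ForwardScanDirection);
    while (res)
    {
        currItem = &so->currPos.items[so->currPos.itemIndex];
        tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
        ntids++;

        res = _hash_next(scan, ForwardScanDirection);
    }

    return ntids;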
/*
 *	hashbeginscan() -- start a scan on a hash index
 */

    /* no order by operators allowed */
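    /*
     * A minimal sketch of the scan setup, assuming the generic
     * RelationGetIndexScan() helper and a palloc'd HashScanOpaqueData whose
     * position starts out invalid:
     */
    Assert(norderbys == 0);

    scan = RelationGetIndexScan(rel, nkeys, norderbys);

    so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
    HashScanPosInvalidate(so->currPos);
    so->hashso_bucket_buf = InvalidBuffer;
    so->hashso_split_bucket_buf = InvalidBuffer;
    so->hashso_buc_populated = false;
    so->hashso_buc_split = false;
    so->killedItems = NULL;
    so->numKilled = 0;

    scan->opaque = so;

    return scan;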
/*
 *	hashrescan() -- rescan an index relation
 */
void
hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
           ScanKey orderbys, int norderbys)
    /* Before leaving current page, deal with any killed items */

    /* set position invalid (this will cause _hash_first call) */

    /* Update scan key, if a new one is given */
/*
 *	hashendscan() -- close down a scan
 */

    /* Before leaving current page, deal with any killed items */
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * This function also deletes the tuples that are moved by split to other
 * buckets.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
    double      tuples_removed;
    double      num_index_tuples;

    tuples_removed = 0;
    num_index_tuples = 0;
    /*
     * We need a copy of the metapage so that we can use its hashm_spares[]
     * values to compute bucket page addresses, but a cached copy should be
     * good enough.  (If not, we'll detect that further down and refresh the
     * cache as necessary.)
     */
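    /*
     * A minimal sketch of obtaining that cached copy, assuming
     * _hash_getcachedmetap() with force_refresh = false:
     */
    cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);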
    Assert(cachedmetap != NULL);
    /* Scan the buckets that we know exist */
    cur_bucket = 0;
    cur_maxbucket = orig_maxbucket;

    while (cur_bucket <= cur_maxbucket)
    {
        bool        split_cleanup = false;

        /* Get address of bucket's start page */
        bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);

        blkno = bucket_blkno;
        /*
         * We need to acquire a cleanup lock on the primary bucket page to
         * wait out concurrent scans before deleting the dead tuples.
         */
        /*
         * If the bucket contains tuples that are moved by split, then we
         * need to delete such tuples.  We can't delete such tuples if the
         * split operation on the bucket is not finished, as those are needed
         * by scans.
         */
        if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
            H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
            split_cleanup = true;
            /*
             * This bucket might have been split since we last held a lock on
             * the metapage.  If so, hashm_maxbucket, hashm_highmask and
             * hashm_lowmask might be old enough to cause us to fail to remove
             * tuples left behind by the most recent split.  To prevent that,
             * now that the primary page of the target bucket has been locked
             * (and thus can't be further split), check whether we need to
             * update our cached metapage data.
             */
            Assert(cachedmetap != NULL);

        hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
                          cachedmetap->hashm_maxbucket,
                          cachedmetap->hashm_highmask,
                          cachedmetap->hashm_lowmask,
                          &tuples_removed, &num_index_tuples, split_cleanup,
                          callback, callback_state);
        /* Advance to next bucket */
        cur_bucket++;
    }
    /* Write-lock metapage and check for split since we started */
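    /*
     * A minimal sketch of taking that write lock and reaching the metapage
     * contents, assuming _hash_getbuf() and the HashPageGetMeta() accessor:
     */
    metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE, LH_META_PAGE);
    metap = HashPageGetMeta(BufferGetPage(metabuf));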
        /* There's been a split, so process the additional bucket(s) */
        cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
        Assert(cachedmetap != NULL);
    /* Okay, we're really done.  Update tuple count in metapage. */

        /*
         * No one has split or inserted anything since start of scan, so
         * believe our count as gospel.
         */

        /*
         * Otherwise, our count is untrustworthy since we may have
         * double-scanned tuples in split buckets.  Proceed by dead-reckoning.
         * (Note: we still return estimated_count = false, because using this
         * count is better than not updating reltuples at all.)
         */
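    /*
     * A minimal sketch of the two cases described above, assuming "metap"
     * points at the write-locked metapage and that orig_maxbucket and
     * orig_ntuples were remembered at the start of the scan:
     */
    if (orig_maxbucket == metap->hashm_maxbucket &&
        orig_ntuples == metap->hashm_ntuples)
        metap->hashm_ntuples = num_index_tuples;
    else
    {
        if (metap->hashm_ntuples > tuples_removed)
            metap->hashm_ntuples -= tuples_removed;
        else
            metap->hashm_ntuples = 0;
        num_index_tuples = metap->hashm_ntuples;
    }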
    /* return statistics */
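    /*
     * A minimal sketch of filling in the result, assuming the counters
     * accumulated above:
     */
    if (stats == NULL)
        stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
    stats->estimated_count = false;
    stats->num_index_tuples = num_index_tuples;
    stats->tuples_removed += tuples_removed;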
    /* hashvacuumcleanup will fill in num_pages */

    return stats;
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
    /* If hashbulkdelete wasn't called, return NULL signifying no change */
    /* Note: this covers the analyze_only case too */
    if (stats == NULL)
        return NULL;

    /* update statistics */
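    /*
     * A minimal sketch, assuming num_pages is simply the index's current
     * size in blocks:
     */
    stats->num_pages = RelationGetNumberOfBlocks(info->index);

    return stats;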
/*
 * Helper function to perform deletion of index entries from a bucket.
 *
 * This function expects that the caller has acquired a cleanup lock on the
 * primary bucket page, and will return with a write lock again held on the
 * primary bucket page.  The lock won't necessarily be held continuously,
 * though, because we'll release it when visiting overflow pages.
 *
 * There can't be any concurrent scans in progress when we first enter this
 * function because of the cleanup lock we hold on the primary bucket page,
 * but as soon as we release that lock, there might be.  If those scans got
 * ahead of our cleanup scan, they might see a tuple before we kill it and
 * wake up only after VACUUM has completed and the TID has been recycled for
 * an unrelated tuple.  To avoid that calamity, we prevent scans from passing
 * our cleanup scan by locking the next page in the bucket chain before
 * releasing the lock on the previous page.  (This type of lock chaining is
 * not ideal, so we might want to look for a better solution at some point.)
 *
 * We need to retain a pin on the primary bucket to ensure that no concurrent
 * splits can start.
 */
void
hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
                  BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
                  uint32 maxbucket, uint32 highmask, uint32 lowmask,
                  double *tuples_removed, double *num_index_tuples,
                  bool split_cleanup,
                  IndexBulkDeleteCallback callback, void *callback_state)
    bool        bucket_dirty = false;

    blkno = bucket_blkno;
    /* Scan each page in bucket */

        bool        retain_pin = false;
        bool        clear_dead_marking = false;
        /* Scan each tuple in page */

            bool        kill_tuple = false;

            itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno));
            htup = &(itup->t_tid);
            /*
             * To remove the dead tuples, we strictly rely on the results of
             * the callback function; refer to btvacuumpage for the detailed
             * reason.
             */
            if (callback && callback(htup, callback_state))
            {
                kill_tuple = true;
                if (tuples_removed)
                    *tuples_removed += 1;
            }
            else if (split_cleanup)
            {
                /* delete the tuples that are moved by split. */
                bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
                                              maxbucket, highmask, lowmask);
                /* mark the item for deletion */
                if (bucket != cur_bucket)
                {
                    /*
                     * We expect tuples to either belong to the current bucket
                     * or to new_bucket.  This is ensured because we don't
                     * allow further splits from a bucket that contains
                     * garbage.  See comments in _hash_expandtable.
                     */
                    Assert(bucket == new_bucket);
                    kill_tuple = true;
                }
            }
            if (kill_tuple)
            {
                /* mark the item for deletion */
                deletable[ndeletable++] = offno;
            }
            else
            {
                /* we're keeping it, so count it */
                if (num_index_tuples)
                    *num_index_tuples += 1;
            }
        /* retain the pin on primary bucket page till end of bucket scan */
        if (blkno == bucket_blkno)
            retain_pin = true;
        /* Apply deletions, advance to next page and write page if needed. */

            /* No ereport(ERROR) until changes are logged */
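            /*
             * A minimal sketch of applying the collected deletions, assuming
             * "deletable"/"ndeletable" hold the offsets marked above (the
             * critical section and WAL logging are elided here):
             */
            PageIndexMultiDelete(page, deletable, ndeletable);
            bucket_dirty = true;
            MarkBufferDirty(buf);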
            /*
             * Let us mark the page as clean if vacuum removes the DEAD
             * tuples from an index page.  We do this by clearing the
             * LH_PAGE_HAS_DEAD_TUPLES flag.
             */
            if (tuples_removed && *tuples_removed > 0 &&
                H_HAS_DEAD_TUPLES(opaque))
            {
                opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
                clear_dead_marking = true;
            }
                /*
                 * bucket buffer was not changed, but still needs to be
                 * registered to ensure that we can acquire a cleanup lock on
                 * it during replay.
                 */
        /* bail out if there are no more pages to scan. */

        /*
         * release the lock on previous page after acquiring the lock on next
         * page
         */
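        /*
         * A minimal sketch of the hand-over-hand locking described above,
         * assuming "next_buf" has already been read and write-locked for the
         * next block, and that only the primary bucket page keeps its pin:
         */
        if (retain_pin)
            LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        else
            _hash_relbuf(rel, buf);

        buf = next_buf;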
    /*
     * Lock the bucket page to clear the garbage flag and squeeze the bucket.
     * If the current buffer is the same as the bucket buffer, then we
     * already have a lock on the bucket page.
     */
    if (buf != bucket_buf)
    {
        _hash_relbuf(rel, buf);
        LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
    }
    /*
     * Clear the garbage flag from the bucket after deleting the tuples that
     * are moved by split.  We purposefully clear the flag before squeezing
     * the bucket, so that after a restart, vacuum won't again try to delete
     * the moved-by-split tuples.
     */

        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();

        bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
        MarkBufferDirty(bucket_buf);
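        /*
         * A minimal sketch of WAL-logging that flag change, assuming the
         * XLOG_HASH_SPLIT_CLEANUP record type and a WAL-logged relation:
         */
        if (RelationNeedsWAL(rel))
        {
            XLogRecPtr  recptr;

            XLogBeginInsert();
            XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);

            recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
            PageSetLSN(BufferGetPage(bucket_buf), recptr);
        }

        END_CRIT_SECTION();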
    /*
     * If we have deleted anything, try to compact free space.  For squeezing
     * the bucket, we must have a cleanup lock, else it can impact the
     * ordering of tuples for a scan that has started before it.
     */
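    /*
     * A minimal sketch of the decision described above, assuming
     * _hash_squeezebucket() does the compaction and requires a cleanup lock
     * on the primary bucket page:
     */
    if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
        _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
                            bstrategy);
    else
        LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);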