/*-------------------------------------------------------------------------
 *
 * hashsearch.c
 *	  search code for postgres hash tables
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/hash/hashsearch.c
 *
 *-------------------------------------------------------------------------
 */
/*
 *	_hash_next() -- Get the next item in a scan.
 *
 *		On entry, so->currPos describes the current page, which may be
 *		pinned but is not locked, and so->currPos.itemIndex identifies
 *		which item was previously returned.
 *
 *		On successful exit, scan->xs_heaptid is set to the TID of the next
 *		heap tuple.  so->currPos is updated as needed.
 *
 *		On failure exit (no more tuples), we return false with a pin held
 *		on the bucket page but no pins or locks held on overflow pages.
 */
	bool		end_of_scan = false;

	/*
	 * Advance to the next tuple on the current page; or if done, try to read
	 * data from the next or previous page based on the scan direction.
	 * Before moving to the next or previous page, make sure that we deal
	 * with all the killed items.
	 */

		/*
		 * We always maintain the pin on the bucket page for the whole scan
		 * operation, so release the additional pin we have acquired here.
		 */

	/* OK, itemIndex says what to return */
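/*
 * The following is a condensed, illustrative sketch of the flow described
 * above, not the actual implementation.  It assumes that so->currPos carries
 * itemIndex, firstItem/lastItem bounds, nextPage/prevPage links, and an
 * items[] array whose entries hold a heapTid; those field names, and the
 * name hash_next_sketch itself, are assumptions made for this example.
 */
static bool
hash_next_sketch(IndexScanDesc scan, ScanDirection dir)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	HashScanPosItem *currItem;
	bool		end_of_scan = false;

	if (ScanDirectionIsForward(dir))
	{
		/* Step forward within the cached page, else read the next page. */
		if (++so->currPos.itemIndex > so->currPos.lastItem)
		{
			BlockNumber blkno = so->currPos.nextPage;

			if (!BlockNumberIsValid(blkno))
				end_of_scan = true;
			else
			{
				Buffer		buf = _hash_getbuf(rel, blkno, HASH_READ,
											   LH_OVERFLOW_PAGE);

				if (!_hash_readpage(scan, &buf, dir))
					end_of_scan = true;
			}
		}
	}
	else
	{
		/* Mirror image: step backward, else read the previous page. */
		if (--so->currPos.itemIndex < so->currPos.firstItem)
			end_of_scan = true; /* previous-page handling omitted here */
	}

	if (end_of_scan)
		return false;			/* only the bucket-page pin is kept */

	/* OK, itemIndex says what to return */
	currItem = &so->currPos.items[so->currPos.itemIndex];
	scan->xs_heaptid = currItem->heapTid;
	return true;
}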
/*
 * Advance to the next page in a bucket, if any.  If we are scanning the
 * bucket being populated during a split operation, then this function
 * advances to the bucket being split after the last bucket page of the
 * bucket being populated.
 */

	bool		block_found = false;

	blkno = (*opaquep)->hasho_nextblkno;
	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * Refer to the comments in _hash_first for the reason we retain that pin.
	 */

	/* check for interrupts while we're not holding any buffer lock */

		/*
		 * End of bucket: scan the bucket being split if there was a split in
		 * progress at the start of the scan.
		 */

		/*
		 * The buffer for the bucket being split must be valid, as we acquire
		 * the pin on it before the start of the scan and retain it till the
		 * end of the scan.
		 */

		/*
		 * Setting hashso_buc_split to true indicates that we are scanning
		 * the bucket being split.
		 */
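/*
 * A simplified sketch (not the actual implementation) of the page-advance
 * logic described above, using the helpers declared in this module.  The
 * field name hashso_bucket_buf, used here for the pinned primary bucket
 * buffer, is an assumption for illustration.
 */
static void
hash_readnext_sketch(IndexScanDesc scan, Buffer *bufp, Page *pagep,
					 HashPageOpaque *opaquep)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	BlockNumber blkno = (*opaquep)->hasho_nextblkno;
	bool		block_found = false;

	/* Keep the pin on the primary (and split) bucket pages; drop others. */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
	else
		_hash_relbuf(rel, *bufp);

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();

	if (BlockNumberIsValid(blkno))
	{
		/* Ordinary case: read the next overflow page in the chain. */
		*bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_OVERFLOW_PAGE);
		block_found = true;
	}
	else if (so->hashso_buc_populated && !so->hashso_buc_split)
	{
		/* End of bucket: switch to the already-pinned bucket being split. */
		*bufp = so->hashso_split_bucket_buf;
		Assert(BufferIsValid(*bufp));
		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		so->hashso_buc_split = true;
		block_found = true;
	}

	if (block_found)
	{
		*pagep = BufferGetPage(*bufp);
		*opaquep = HashPageGetOpaque(*pagep);
	}
}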
/*
 * Advance to the previous page in a bucket, if any.  If the current scan
 * started during a split operation, then this function advances to the
 * bucket being populated after the first bucket page of the bucket being
 * split.
 */

	blkno = (*opaquep)->hasho_prevblkno;
	/*
	 * Retain the pin on the primary bucket page till the end of the scan.
	 * Refer to the comments in _hash_first for the reason we retain that pin.
	 */

	/* check for interrupts while we're not holding any buffer lock */

		/*
		 * We always maintain the pin on the bucket page for the whole scan
		 * operation, so release the additional pin we have acquired here.
		 */

		/*
		 * End of bucket: scan the bucket being populated if there was a
		 * split in progress at the start of the scan.
		 */

		/*
		 * The buffer for the bucket being populated must be valid, as we
		 * acquire the pin on it before the start of the scan and retain it
		 * till the end of the scan.
		 */

		/* move to the end of the bucket chain */

		/*
		 * Setting hashso_buc_split to false indicates that we are scanning
		 * the bucket being populated.
		 */
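/*
 * A simplified sketch (not the actual implementation) of the mirror-image
 * case: walking backward through the chain and, at the front of the bucket
 * being split, jumping to the end of the bucket being populated.  As above,
 * hashso_bucket_buf is an assumed field name; the helpers are the ones
 * declared in this module.
 */
static void
hash_readprev_sketch(IndexScanDesc scan, Buffer *bufp, Page *pagep,
					 HashPageOpaque *opaquep)
{
	Relation	rel = scan->indexRelation;
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	BlockNumber blkno = (*opaquep)->hasho_prevblkno;
	bool		haveprevblk;

	/* Keep the pin on the primary (and split) bucket pages; drop others. */
	if (*bufp == so->hashso_bucket_buf || *bufp == so->hashso_split_bucket_buf)
	{
		LockBuffer(*bufp, BUFFER_LOCK_UNLOCK);
		haveprevblk = false;	/* a bucket page has no real "previous" page */
	}
	else
	{
		_hash_relbuf(rel, *bufp);
		haveprevblk = true;
	}

	*bufp = InvalidBuffer;
	/* check for interrupts while we're not holding any buffer lock */
	CHECK_FOR_INTERRUPTS();

	if (haveprevblk)
	{
		/* Ordinary case: read the previous page in the chain. */
		*bufp = _hash_getbuf(rel, blkno, HASH_READ,
							 LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = HashPageGetOpaque(*pagep);

		/* We already hold a pin on the bucket page, so drop the extra one. */
		if (*bufp == so->hashso_bucket_buf ||
			*bufp == so->hashso_split_bucket_buf)
			_hash_dropbuf(rel, *bufp);
	}
	else if (so->hashso_buc_populated && so->hashso_buc_split)
	{
		/* End of this bucket: continue in the bucket being populated. */
		*bufp = so->hashso_bucket_buf;
		Assert(BufferIsValid(*bufp));
		LockBuffer(*bufp, BUFFER_LOCK_SHARE);
		*pagep = BufferGetPage(*bufp);
		*opaquep = HashPageGetOpaque(*pagep);

		/* move to the end of the bucket chain */
		while (BlockNumberIsValid((*opaquep)->hasho_nextblkno))
			hash_readnext_sketch(scan, bufp, pagep, opaquep);

		so->hashso_buc_split = false;
	}
}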
/*
 *	_hash_first() -- Find the first item in a scan.
 *
 *		We find the first item (or, if backward scan, the last item) in the
 *		index that satisfies the qualification associated with the scan
 *		descriptor.
 *
 *		On successful exit, if the page containing the current index tuple
 *		is an overflow page, both pin and lock are released; if it is a
 *		bucket page, it is pinned but not locked.  Data about the matching
 *		tuple(s) on the page has been loaded into so->currPos, and
 *		scan->xs_heaptid is set to the heap TID of the current tuple.
 *
 *		On failure exit (no more tuples), we return false, with a pin held
 *		on the bucket page but no pins or locks held on overflow pages.
 */
	/*
	 * We do not support hash scans with no index qualification, because we
	 * would have to read the whole index rather than just one bucket.  That
	 * creates a whole raft of problems, since we haven't got a practical way
	 * to lock all the buckets against splits or compactions.
	 */
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("hash indexes do not support whole-index scans")));
	/* There may be more than one index qual, but we hash only the first */

	/* We support only single-column hash indexes */

	/* And there's only one operator strategy, too */

	/*
	 * If the constant in the index qual is NULL, assume it cannot match any
	 * items in the index.
	 */
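	/*
	 * A minimal sketch of that early exit; the scankey variable name "cur"
	 * is illustrative.  A NULL comparison value can never hash-match
	 * anything, so the scan can simply report that there are no tuples.
	 */
	if (cur->sk_flags & SK_ISNULL)
		return false;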
	/*
	 * Okay to compute the hash key.  We want to do this before acquiring
	 * any locks, in case a user-defined hash function happens to be slow.
	 *
	 * If the scankey operator is not a cross-type comparison, we can use
	 * the cached hash function; otherwise we have to look it up in the
	 * catalogs.
	 *
	 * We support the convention that sk_subtype == InvalidOid means the
	 * opclass input type; this is a hack to simplify life for ScanKeyInit().
	 */
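	/*
	 * A sketch of the key computation under those rules; the local variable
	 * names ("cur", "hashkey") are illustrative.  rd_opcintype[0] is the
	 * opclass input type of the index's single column.
	 */
	if (cur->sk_subtype == rel->rd_opcintype[0] ||
		cur->sk_subtype == InvalidOid)
		hashkey = _hash_datum2hashkey(rel, cur->sk_argument);
	else
		hashkey = _hash_datum2hashkey_type(rel, cur->sk_argument,
										   cur->sk_subtype);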
	bucket = opaque->hasho_bucket;
	/*
	 * If a bucket split is in progress, then while scanning the bucket being
	 * populated, we need to skip tuples that were copied from the bucket
	 * being split.  We also need to maintain a pin on the bucket being split
	 * to ensure that split-cleanup work done by vacuum doesn't remove tuples
	 * from it till this scan is done.  We need to maintain a pin on the
	 * bucket being populated to ensure that vacuum doesn't squeeze that
	 * bucket till this scan is complete; otherwise, the ordering of tuples
	 * can't be maintained during forward and backward scans.  Here, we have
	 * to be cautious about locking order: first, acquire the lock on the
	 * bucket being split; then, release the lock on it but not the pin;
	 * then, acquire a lock on the bucket being populated and re-verify
	 * whether the bucket split is still in progress.  Acquiring the lock on
	 * the bucket being split first ensures that vacuum waits for this scan
	 * to finish.
	 */

		/*
		 * Release the lock on the new bucket and re-acquire it after
		 * acquiring the lock on the old bucket.
		 */

		/*
		 * Remember the split bucket buffer so as to use it later for
		 * scanning.
		 */

		Assert(opaque->hasho_bucket == bucket);
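	/*
	 * A condensed sketch (not the actual implementation) of the locking
	 * dance described above.  It assumes locals named buf/page/opaque for
	 * the bucket being populated, plus rel, bucket and so from the
	 * surrounding code; error handling and some bookkeeping are omitted.
	 */
	if (H_BUCKET_BEING_POPULATED(opaque))
	{
		BlockNumber old_blkno;
		Buffer		old_buf;

		old_blkno = _hash_get_oldblock_from_newbucket(rel, bucket);

		/* Give up the lock (but not the pin) on the bucket being populated. */
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);

		/* Lock the bucket being split first, so vacuum waits for this scan. */
		old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE);

		/* Keep the pin for later scanning, but release the lock. */
		so->hashso_split_bucket_buf = old_buf;
		LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);

		/* Re-lock the bucket being populated and re-check the split flag. */
		LockBuffer(buf, BUFFER_LOCK_SHARE);
		page = BufferGetPage(buf);
		opaque = HashPageGetOpaque(page);
		Assert(opaque->hasho_bucket == bucket);

		if (H_BUCKET_BEING_POPULATED(opaque))
			so->hashso_buc_populated = true;
		else
		{
			/* The split finished while we were waiting; drop the extra pin. */
			_hash_dropbuf(rel, so->hashso_split_bucket_buf);
			so->hashso_split_bucket_buf = InvalidBuffer;
		}
	}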
	/* If a backwards scan is requested, move to the end of the chain */

		/*
		 * Backward scans that start during a split need to start from the
		 * end of the bucket being split.
		 */
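	/*
	 * A sketch of that positioning step, assuming locals buf/page/opaque for
	 * the current page: keep reading forward until the end of the chain.
	 * The page-advance helper switches over to the bucket being split when
	 * the populated bucket's chain ends, which is exactly what a backward
	 * scan that started during a split needs.
	 */
	if (ScanDirectionIsBackward(dir))
	{
		while (BlockNumberIsValid(opaque->hasho_nextblkno) ||
			   (so->hashso_buc_populated && !so->hashso_buc_split))
			hash_readnext_sketch(scan, &buf, &page, &opaque);
	}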
	/* remember which buffer we have pinned, if any */

	/* Now find all the tuples satisfying the qualification from a page */

	/* OK, itemIndex says what to return */

	/* if we're here, _hash_readpage found at least one valid tuple */
/*
 *	_hash_readpage() -- Load data from the current index page into so->currPos
 *
 *		We scan all the items in the current index page and save them into
 *		so->currPos if they satisfy the qualification.  If no matching items
 *		are found in the current page, we move to the next or previous page
 *		in the bucket chain, as indicated by the scan direction.
 *
 *		Return true if any matching items are found, else return false.
 */
			/* new page, locate starting position by binary search */

			/*
			 * Could not find any matching tuples in the current page; move
			 * to the next page.  Before leaving the current page, deal with
			 * any killed items.
			 */

			/*
			 * If this is a primary bucket page, hasho_prevblkno is not a
			 * real block number.
			 */

				prev_blkno = opaque->hasho_prevblkno;

				/*
				 * Remember the next and previous block numbers so that
				 * scrollable cursors know where to restart, and return false
				 * to indicate that no more matching tuples were found.
				 * Also, don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
			/* new page, locate starting position by binary search */

			/*
			 * Could not find any matching tuples in the current page; move
			 * to the previous page.  Before leaving the current page, deal
			 * with any killed items.
			 */

				next_blkno = opaque->hasho_nextblkno;

				/*
				 * Remember the next and previous block numbers so that
				 * scrollable cursors know where to restart, and return false
				 * to indicate that no more matching tuples were found.
				 * Also, don't reset currPage or lsn, because we expect
				 * _hash_kill_items to be called for the old page after this
				 * function returns.
				 */
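/*
 * A forward-direction sketch of the loop described above (the backward case
 * is symmetric), not the actual implementation.  It assumes so->currPos
 * fields named buf, currPage, prevPage, nextPage, firstItem, lastItem and
 * itemIndex, a saved hash key in so->hashso_sk_hash, and a killed-item count
 * in so->numKilled; those names, and hash_readpage_forward_sketch itself,
 * are assumptions for the example.
 */
static bool
hash_readpage_forward_sketch(IndexScanDesc scan, Buffer *bufP)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	Buffer		buf = *bufP;
	Page		page = BufferGetPage(buf);
	HashPageOpaque opaque = HashPageGetOpaque(page);
	BlockNumber prev_blkno = InvalidBlockNumber;
	OffsetNumber offnum;
	int			itemIndex;

	for (;;)
	{
		/* new page, locate starting position by binary search */
		offnum = _hash_binsearch(page, so->hashso_sk_hash);

		itemIndex = _hash_load_qualified_items(scan, page, offnum,
											   ForwardScanDirection);
		if (itemIndex != 0)
			break;				/* found something on this page */

		/* Nothing here: flush killed items, then advance to the next page. */
		if (so->numKilled > 0)
			_hash_kill_items(scan);

		/* hasho_prevblkno of a primary bucket page is not a real block */
		if (buf == so->hashso_bucket_buf || buf == so->hashso_split_bucket_buf)
			prev_blkno = InvalidBlockNumber;
		else
			prev_blkno = opaque->hasho_prevblkno;

		hash_readnext_sketch(scan, &buf, &page, &opaque);
		if (!BufferIsValid(buf))
		{
			/* End of bucket: record positions for scrollable cursors. */
			so->currPos.prevPage = prev_blkno;
			so->currPos.nextPage = InvalidBlockNumber;
			so->currPos.buf = buf;
			return false;
		}
		so->currPos.buf = buf;
		so->currPos.currPage = BufferGetBlockNumber(buf);
	}

	/* Expose the loaded items to the caller through so->currPos. */
	so->currPos.firstItem = 0;
	so->currPos.lastItem = itemIndex - 1;
	so->currPos.itemIndex = 0;
	*bufP = so->currPos.buf;
	return true;
}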
/*
 * Load all the qualified items from the current index page
 * into so->currPos.  Helper function for _hash_readpage.
 */
		/* load items[] in ascending order */

		while (offnum <= maxoff)

			/*
			 * Skip tuples that were moved by a split operation if this scan
			 * started while the split was in progress.  Also skip tuples
			 * that are marked as dead.
			 */

				/* tuple is qualified, so remember it */

				/*
				 * No more matching tuples exist on this page, so exit the
				 * while loop.
				 */
		/* load items[] in descending order */

			/*
			 * Skip tuples that were moved by a split operation if this scan
			 * started while the split was in progress.  Also skip tuples
			 * that are marked as dead.
			 */

				/* tuple is qualified, so remember it */

				/*
				 * No more matching tuples exist on this page, so exit the
				 * while loop.
				 */
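/*
 * A forward-direction sketch of the loop above (the descending case walks
 * offsets with OffsetNumberPrev instead), not the actual implementation.
 * so->hashso_sk_hash, the saved hash key for this scan, is an assumed field
 * name, as is hash_load_qualified_items_forward_sketch itself.
 */
static int
hash_load_qualified_items_forward_sketch(IndexScanDesc scan, Page page,
										 OffsetNumber offnum)
{
	HashScanOpaque so = (HashScanOpaque) scan->opaque;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);
	int			itemIndex = 0;

	/* load items[] in ascending order */
	while (offnum <= maxoff)
	{
		IndexTuple	itup;

		Assert(offnum >= FirstOffsetNumber);
		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));

		/*
		 * Skip tuples moved by a split (when this scan started during the
		 * split) and tuples whose line pointers are marked dead.
		 */
		if ((so->hashso_buc_populated && !so->hashso_buc_split &&
			 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
			(scan->ignore_killed_tuples &&
			 ItemIdIsDead(PageGetItemId(page, offnum))))
		{
			offnum = OffsetNumberNext(offnum);
			continue;
		}

		if (so->hashso_sk_hash == _hash_get_indextuple_hashkey(itup) &&
			_hash_checkqual(scan, itup))
		{
			/* tuple is qualified, so remember it */
			_hash_saveitem(so, itemIndex, offnum, itup);
			itemIndex++;
		}
		else
		{
			/* No more matching tuples exist on this page, so stop. */
			break;
		}

		offnum = OffsetNumberNext(offnum);
	}

	Assert(itemIndex <= MaxIndexTuplesPerPage);
	return itemIndex;
}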
/* Save an index item into so->currPos.items[itemIndex] */
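/*
 * A plausible sketch of that helper (not the actual implementation): copy
 * the heap TID out of the index tuple and remember the index offset so the
 * item can later be killed if the heap tuple turns out to be dead.  The
 * HashScanPosItem member names heapTid and indexOffset are assumptions.
 */
static inline void
hash_saveitem_sketch(HashScanOpaque so, int itemIndex,
					 OffsetNumber offnum, IndexTuple itup)
{
	HashScanPosItem *currItem = &so->currPos.items[itemIndex];

	currItem->heapTid = itup->t_tid;
	currItem->indexOffset = offnum;
}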