/*-------------------------------------------------------------------------
 *
 * nodeIndexonlyscan.c
 *	  Routines to support index-only scans
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/executor/nodeIndexonlyscan.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * INTERFACE ROUTINES
 *		ExecIndexOnlyScan			scans an index
 *		IndexOnlyNext				retrieve next tuple
 *		ExecInitIndexOnlyScan		creates and initializes state info.
 *		ExecReScanIndexOnlyScan		rescans the indexed relation.
 *		ExecEndIndexOnlyScan		releases all storage.
 *		ExecIndexOnlyMarkPos		marks scan position.
 *		ExecIndexOnlyRestrPos		restores scan position.
 *		ExecIndexOnlyScanEstimate	estimates DSM space needed for
 *									parallel index-only scan
 *		ExecIndexOnlyScanInitializeDSM	initialize DSM for parallel
 *									index-only scan
 *		ExecIndexOnlyScanReInitializeDSM	reinitialize DSM for fresh scan
 *		ExecIndexOnlyScanInitializeWorker	attach to DSM info in parallel worker
 */
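/*
 * The original #include list was lost in extraction; the list below is an
 * assumed, abridged reconstruction of headers this code plainly relies on
 * (index AM, visibility map, executor, predicate locks).
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/relscan.h"
#include "access/tableam.h"
#include "access/visibilitymap.h"
#include "executor/executor.h"
#include "executor/nodeIndexonlyscan.h"
#include "executor/nodeIndexscan.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/predicate.h"
#include "utils/rel.h"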
/* ----------------------------------------------------------------
 *		IndexOnlyNext
 *
 *		Retrieve a tuple from the IndexOnlyScan node's index.
 * ----------------------------------------------------------------
 */
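/* Sketch (assumed): signature and locals, reconstructed from usage below */
static TupleTableSlot *
IndexOnlyNext(IndexOnlyScanState *node)
{
    EState     *estate;
    ExprContext *econtext;
    ScanDirection direction;
    IndexScanDesc scandesc;
    TupleTableSlot *slot;
    ItemPointer tid;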
    /*
     * extract necessary information from index scan node
     */
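    /* Sketch (assumed) */
    estate = node->ss.ps.state;
    econtext = node->ss.ps.ps_ExprContext;
    slot = node->ss.ss_ScanTupleSlot;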
    /*
     * Determine which direction to scan the index in based on the plan's scan
     * direction and the current direction of execution.
     */
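    /* Sketch (assumed) */
    direction = ScanDirectionCombine(estate->es_direction,
                                     ((IndexOnlyScan *) node->ss.ps.plan)->indexorderdir);
    scandesc = node->ioss_ScanDesc;

    if (scandesc == NULL)
    {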
        /*
         * We reach here if the index only scan is not parallel, or if we're
         * serially executing an index only scan that was planned to be
         * parallel.
         */
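        /* Sketch (assumed): start the scan through the index AM */
        scandesc = index_beginscan(node->ss.ss_currentRelation,
                                   node->ioss_RelationDesc,
                                   estate->es_snapshot,
                                   &node->ioss_Instrument,
                                   node->ioss_NumScanKeys,
                                   node->ioss_NumOrderByKeys);
        node->ioss_ScanDesc = scandesc;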
        /* Set it up for index-only scan */
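        /* assumed: ask the AM for index tuples and reset the VM buffer pin */
        node->ioss_ScanDesc->xs_want_itup = true;
        node->ioss_VMBuffer = InvalidBuffer;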
        /*
         * If no run-time keys to calculate or they are ready, go ahead and
         * pass the scankeys to the index AM.
         */
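        /* Sketch (assumed) */
        if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
            index_rescan(scandesc,
                         node->ioss_ScanKeys, node->ioss_NumScanKeys,
                         node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);
    }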
    /*
     * OK, now that we have what we need, fetch the next tuple.
     */
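    /* Sketch (assumed): loop over TIDs returned by the index */
    while ((tid = index_getnext_tid(scandesc, direction)) != NULL)
    {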
        bool        tuple_from_heap = false;
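        /* assumed: allow query cancel while looping over index entries */
        CHECK_FOR_INTERRUPTS();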
        /*
         * We can skip the heap fetch if the TID references a heap page on
         * which all tuples are known visible to everybody.  In any case,
         * we'll use the index tuple not the heap tuple as the data source.
         *
         * Note on Memory Ordering Effects: visibilitymap_get_status does not
         * lock the visibility map buffer, and therefore the result we read
         * here could be slightly stale.  However, it can't be stale enough to
         * matter.
         *
         * We need to detect clearing a VM bit due to an insert right away,
         * because the tuple is present in the index page but not visible. The
         * reading of the TID by this scan (using a shared lock on the index
         * buffer) is serialized with the insert of the TID into the index
         * (using an exclusive lock on the index buffer).  Because the VM bit
         * is cleared before updating the index, and locking/unlocking of the
         * index page acts as a full memory barrier, we are sure to see the
         * cleared bit if we see a recently-inserted TID.
         *
         * Deletes do not update the index page (only VACUUM will clear out
         * the TID), so the clearing of the VM bit by a delete is not
         * serialized with this test below, and we may see a value that is
         * significantly stale.  However, we don't care about the delete right
         * away, because the tuple is still visible until the deleting
         * transaction commits or the statement ends (if it's our
         * transaction).  In either case, the lock on the VM buffer will have
         * been released (acting as a write barrier) after clearing the bit.
         * And for us to have a snapshot that includes the deleting
         * transaction (making the tuple invisible), we must have acquired
         * ProcArrayLock after that time, acting as a read barrier.
         *
         * It's worth going through this complexity to avoid needing to lock
         * the VM buffer, which could cause significant contention.
         */
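        /*
         * Sketch (assumed): the check itself.  VM_ALL_VISIBLE consults the
         * visibility map, keeping a pin on the VM page in ioss_VMBuffer.
         */
        if (!VM_ALL_VISIBLE(scandesc->heapRelation,
                            ItemPointerGetBlockNumber(tid),
                            &node->ioss_VMBuffer))
        {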
            /*
             * Rats, we have to visit the heap to check visibility.
             */
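            /* assumed: count the heap fetch, then do it through the table AM */
            InstrCountTuples2(node, 1);
            if (!index_fetch_heap(scandesc, node->ioss_TableSlot))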
                continue;       /* no visible tuple, try next index entry */

            /*
             * Only MVCC snapshots are supported here, so there should be no
             * need to keep following the HOT chain once a visible entry has
             * been found.  If we did want to allow that, we'd need to keep
             * more state to remember not to call index_getnext_tid next time.
             */
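            /* Sketch (assumed): xs_heap_continue signals more of the HOT chain */
            if (scandesc->xs_heap_continue)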
                elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");

            /*
             * Note: at this point we are holding a pin on the heap page, as
             * recorded in scandesc->xs_cbuf.  We could release that pin now,
             * but it's not clear whether it's a win to do so.  The next index
             * entry might require a visit to the same heap page.
             */

            tuple_from_heap = true;
        }

        /*
         * Fill the scan tuple slot with data from the index.  This might be
         * provided in either HeapTuple or IndexTuple format.  Conceivably an
         * index AM might fill both fields, in which case we prefer the heap
         * format, since it's probably a bit cheaper to fill a slot from.
         */

        /*
         * We don't take the trouble to verify that the provided tuple has
         * exactly the slot's format, but it seems worth doing a quick
         * check on the number of fields.
         */
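        /* Sketch (assumed): prefer the heap-format tuple if the AM supplied one */
        if (scandesc->xs_hitup)
        {
            Assert(slot->tts_tupleDescriptor->natts ==
                   scandesc->xs_hitupdesc->natts);
            ExecForceStoreHeapTuple(scandesc->xs_hitup, slot, false);
        }
        else if (scandesc->xs_itup)
            StoreIndexTuple(node, slot, scandesc->xs_itup, scandesc->xs_itupdesc);
        else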
            elog(ERROR, "no data returned for index-only scan");

        /*
         * If the index was lossy, we have to recheck the index quals.
         */
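        /* Sketch (assumed): "recheckqual" is the node's recheck expression state */
        if (scandesc->xs_recheck)
        {
            econtext->ecxt_scantuple = slot;
            if (!ExecQualAndReset(node->recheckqual, econtext))
            {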
                /* Fails recheck, so drop it and loop back for another */
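                InstrCountFiltered2(node, 1);
                continue;
            }
        }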
        /*
         * We don't currently support rechecking ORDER BY distances.  (In
         * principle, if the index can support retrieval of the originally
         * indexed value, it should be able to produce an exact distance
         * calculation too.  So it's not clear that adding code here for
         * recheck/re-sort would be worth the trouble.  But we should at least
         * throw an error if someone tries it.)
         */
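        /* Sketch (assumed) */
        if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby)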
            ereport(ERROR,
                    (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                     errmsg("lossy distance functions are not supported in index-only scans")));

        /*
         * If we didn't access the heap, then we'll need to take a predicate
         * lock explicitly, as if we had.  For now we do that at page level.
         */
        if (!tuple_from_heap)
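            /* Sketch (assumed) */
            PredicateLockPage(scandesc->heapRelation,
                              ItemPointerGetBlockNumber(tid),
                              estate->es_snapshot);

        return slot;
    }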
    /*
     * if we get here it means the index scan failed so we are at the end of
     * the scan..
     */
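    /* Sketch (assumed): return an empty slot to signal end of scan */
    return ExecClearTuple(slot);
}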
/*
 * StoreIndexTuple
 *		Fill the slot with data from the index tuple.
 *
 * At some point this might be generally-useful functionality, but
 * right now we don't need it elsewhere.
 */
    /*
     * Note: we must use the tupdesc supplied by the AM in index_deform_tuple,
     * not the slot's tupdesc, in case the latter has different datatypes
     * (this happens for btree name_ops in particular).  They'd better have
     * the same number of columns though, as well as being datatype-compatible
     * which is something we can't so easily check.
     */
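    /* Sketch (assumed): deform into the slot's values/isnull arrays */
    Assert(slot->tts_tupleDescriptor->natts == itupdesc->natts);

    ExecClearTuple(slot);
    index_deform_tuple(itup, itupdesc, slot->tts_values, slot->tts_isnull);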
    /*
     * Copy all name columns stored as cstrings back into a NAMEDATALEN byte
     * sized allocation.  We mark this branch as unlikely as generally "name"
     * is used only for the system catalogs and this would have to be a user
     * query running on those or some other user table with an index on a name
     * column.
     */
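    /* Sketch (assumed; loop variable names are reconstructions, not verbatim) */
    if (unlikely(node->ioss_NameCStringCount != 0))
    {
        Datum      *values = slot->tts_values;

        for (int i = 0; i < node->ioss_NameCStringCount; i++)
        {
            AttrNumber  att = node->ioss_NameCStringAttNums[i];
            Name        name;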
            /* skip null Datums */
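            if (slot->tts_isnull[att])
                continue;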
            /* allocate the NAMEDATALEN and copy the datum into that memory */
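            name = (Name) MemoryContextAlloc(node->ss.ps.ps_ExprContext->ecxt_per_tuple_memory,
                                             NAMEDATALEN);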
            /* use namestrcpy to zero-pad all trailing bytes */
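            namestrcpy(name, DatumGetCString(values[att]));
            values[att] = NameGetDatum(name);
        }
    }

    ExecStoreVirtualTuple(slot);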
/*
 * IndexOnlyRecheck -- access method routine to recheck a tuple in EvalPlanQual
 *
 * This can't really happen, since an index can't supply CTID which would
 * be necessary data for any potential EvalPlanQual target relation.  If it
 * did happen, the EPQ code would pass us the wrong data, namely a heap
 * tuple not an index tuple.  So throw an error.
 */
    elog(ERROR, "EvalPlanQual recheck is not supported in index-only scans");
    return false;               /* keep compiler quiet */

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScan(node)
 * ----------------------------------------------------------------
 */
    /*
     * If we have runtime keys and they've not already been set up, do it now.
     */
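    /* Sketch (assumed) */
    IndexOnlyScanState *node = castNode(IndexOnlyScanState, pstate);

    if (node->ioss_NumRuntimeKeys != 0 && !node->ioss_RuntimeKeysReady)
        ExecReScan((PlanState *) node);

    return ExecScan(&node->ss,
                    (ExecScanAccessMtd) IndexOnlyNext,
                    (ExecScanRecheckMtd) IndexOnlyRecheck);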
/* ----------------------------------------------------------------
 *		ExecReScanIndexOnlyScan(node)
 *
 *		Recalculates the values of any scan keys whose value depends on
 *		information known at runtime, then rescans the indexed relation.
 *
 *		Updating the scan key was formerly done separately in
 *		ExecUpdateIndexScanKeys.  Integrating it into ReScan makes
 *		rescans of indices and relations/general streams more uniform.
 * ----------------------------------------------------------------
 */
    /*
     * If we are doing runtime key calculations (ie, any of the index key
     * values weren't simple Consts), compute the new key values.  But first,
     * reset the context so we don't leak memory as each outer tuple is
     * scanned.  Note this assumes that we will recalculate *all* runtime keys
     * on each call.
     */
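    /* Sketch (assumed) */
    if (node->ioss_NumRuntimeKeys != 0)
    {
        ExprContext *econtext = node->ioss_RuntimeContext;

        ResetExprContext(econtext);
        ExecIndexEvalRuntimeKeys(econtext,
                                 node->ioss_RuntimeKeys,
                                 node->ioss_NumRuntimeKeys);
    }
    node->ioss_RuntimeKeysReady = true;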
    /* reset index scan */
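    /* Sketch (assumed): the scan may not have started yet */
    if (node->ioss_ScanDesc)
        index_rescan(node->ioss_ScanDesc,
                     node->ioss_ScanKeys, node->ioss_NumScanKeys,
                     node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);

    ExecScanReScan(&node->ss);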
/* ----------------------------------------------------------------
 *		ExecEndIndexOnlyScan
 * ----------------------------------------------------------------
 */
    /*
     * extract information from the node
     */
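    /* Sketch (assumed) */
    indexRelationDesc = node->ioss_RelationDesc;
    indexScanDesc = node->ioss_ScanDesc;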
    /* Release VM buffer pin, if any. */
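    if (node->ioss_VMBuffer != InvalidBuffer)
    {
        ReleaseBuffer(node->ioss_VMBuffer);
        node->ioss_VMBuffer = InvalidBuffer;
    }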
    /*
     * When ending a parallel worker, copy the statistics gathered by the
     * worker back into shared memory so that it can be picked up by the main
     * process to report in EXPLAIN ANALYZE.
     */
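    /* Sketch (assumed) */
    if (node->ioss_SharedInfo != NULL && IsParallelWorker())
    {
        IndexScanInstrumentation *winstrument;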
        Assert(ParallelWorkerNumber <= node->ioss_SharedInfo->num_workers);

        /*
         * We have to accumulate the stats rather than performing a memcpy.
         * When a Gather/GatherMerge node finishes it will perform planner
         * shutdown on the workers.  On rescan it will spin up new workers
         * which will have a new IndexOnlyScanState and zeroed stats.
         */
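        /* assumed: the counter field name in IndexScanInstrumentation */
        winstrument = &node->ioss_SharedInfo->winstrument[ParallelWorkerNumber];
        winstrument->nsearches += node->ioss_Instrument.nsearches;
    }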
    /*
     * close the index relation (no-op if we didn't open it)
     */
    if (indexRelationDesc)
        index_close(indexRelationDesc, NoLock);

/* ----------------------------------------------------------------
 *		ExecIndexOnlyMarkPos
 *
 * Note: we assume that no caller attempts to set a mark before having read
 * at least one tuple.  Otherwise, ioss_ScanDesc might still be NULL.
 * ----------------------------------------------------------------
 */
    if (epqstate != NULL)
    {
        /*
         * We are inside an EvalPlanQual recheck.  If a test tuple exists for
         * this relation, then we shouldn't access the index at all.  We would
         * instead need to save, and later restore, the state of the
         * relsubs_done flag, so that re-fetching the test tuple is possible.
         * However, given the assumption that no caller sets a mark at the
         * start of the scan, we can only get here with relsubs_done[i]
         * already set, and so no state need be saved.
         */
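        /* Sketch (assumed): locate this scan's entry in the EPQ state */
        Index       scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;

        Assert(scanrelid > 0);
        if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
            epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
        {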
            /* Verify the claim above */
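            if (!epqstate->relsubs_done[scanrelid - 1])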
                elog(ERROR, "unexpected ExecIndexOnlyMarkPos call in EPQ recheck");
/* ----------------------------------------------------------------
 *		ExecIndexOnlyRestrPos
 * ----------------------------------------------------------------
 */
    if (estate->es_epq_active != NULL)
    {
        /* See comments in ExecIndexOnlyMarkPos */
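        /* Sketch (assumed), mirroring ExecIndexOnlyMarkPos */
        EPQState   *epqstate = estate->es_epq_active;
        Index       scanrelid = ((Scan *) node->ss.ps.plan)->scanrelid;

        if (epqstate->relsubs_slot[scanrelid - 1] != NULL ||
            epqstate->relsubs_rowmark[scanrelid - 1] != NULL)
        {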
            /* Verify the claim above */
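            if (!epqstate->relsubs_done[scanrelid - 1])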
                elog(ERROR, "unexpected ExecIndexOnlyRestrPos call in EPQ recheck");
/* ----------------------------------------------------------------
 *		ExecInitIndexOnlyScan
 *
 *		Initializes the index scan's state information, creates
 *		scan keys, and opens the base and index relations.
 *
 *		Note: index scans have 2 sets of state information because
 *		we have to keep track of the base relation and the
 *		index relation.
 * ----------------------------------------------------------------
 */
    /*
     * create state structure
     */
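    /* Sketch (assumed) */
    IndexOnlyScanState *indexstate = makeNode(IndexOnlyScanState);

    indexstate->ss.ps.plan = (Plan *) node;
    indexstate->ss.ps.state = estate;
    indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;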
    /*
     * Miscellaneous initialization
     *
     * create expression context for node
     */
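    /* assumed */
    ExecAssignExprContext(estate, &indexstate->ss.ps);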
    /*
     * open the scan relation
     */
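    /* assumed */
    indexstate->ss.ss_currentRelation =
        ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);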
    /*
     * Build the scan tuple type using the indextlist generated by the
     * planner.  We use this, rather than the index's physical tuple
     * descriptor, because the latter contains storage column types not the
     * types of the original datums.  (It's the AM's responsibility to return
     * suitable data anyway.)
     */
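    /* Sketch (assumed): a virtual slot is enough for index-supplied datums */
    TupleDesc   tupDesc = ExecTypeFromTL(node->indextlist);

    ExecInitScanTupleSlot(estate, &indexstate->ss, tupDesc, &TTSOpsVirtual);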
    /*
     * We need another slot, in a format that's suitable for the table AM, for
     * when we need to fetch a tuple from the table for rechecking visibility.
     */
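    /* Sketch (assumed) */
    indexstate->ioss_TableSlot =
        ExecAllocTableSlot(&estate->es_tupleTable,
                           RelationGetDescr(indexstate->ss.ss_currentRelation),
                           table_slot_callbacks(indexstate->ss.ss_currentRelation));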
    /*
     * Initialize result type and projection info.  The node's targetlist will
     * contain Vars with varno = INDEX_VAR, referencing the scan tuple.
     */
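    /* Sketch (assumed) */
    ExecInitResultTypeTL(&indexstate->ss.ps);
    ExecAssignScanProjectionInfoWithVarno(&indexstate->ss, INDEX_VAR);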
    /*
     * initialize child expressions
     *
     * Note: we don't initialize all of the indexorderby expression, only the
     * sub-parts corresponding to runtime keys (see below).
     */
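    /* Sketch (assumed): qual is the table-level filter, recheckqual the lossy recheck */
    indexstate->ss.ps.qual =
        ExecInitQual(node->scan.plan.qual, (PlanState *) indexstate);
    indexstate->recheckqual =
        ExecInitQual(node->recheckqual, (PlanState *) indexstate);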
    /*
     * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
     * here.  This allows an index-advisor plugin to EXPLAIN a plan containing
     * references to nonexistent indexes.
     */
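    /* assumed */
    if (eflags & EXEC_FLAG_EXPLAIN_ONLY)
        return indexstate;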
    /* Open the index relation. */
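    /* Sketch (assumed): lock mode matches what was taken on the heap relation */
    lockmode = exec_rt_fetch(node->scan.scanrelid, estate)->rellockmode;
    indexstate->ioss_RelationDesc = index_open(node->indexid, lockmode);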
    /*
     * Initialize index-specific scan state
     */
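    /* Sketch (assumed) */
    indexstate->ioss_RuntimeKeysReady = false;
    indexstate->ioss_RuntimeKeys = NULL;
    indexstate->ioss_NumRuntimeKeys = 0;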
    /*
     * build the index scan keys from the index qualification
     */
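    /* Sketch (assumed): scan keys, with runtime keys collected on the side */
    ExecIndexBuildScanKeys((PlanState *) indexstate,
                           indexstate->ioss_RelationDesc,
                           node->indexqual,
                           false,
                           &indexstate->ioss_ScanKeys,
                           &indexstate->ioss_NumScanKeys,
                           &indexstate->ioss_RuntimeKeys,
                           &indexstate->ioss_NumRuntimeKeys,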
                           NULL,    /* no ArrayKeys */
                           NULL);

    /*
     * any ORDER BY exprs have to be turned into scankeys in the same way
     */
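    /* Sketch (assumed) */
    ExecIndexBuildScanKeys((PlanState *) indexstate,
                           indexstate->ioss_RelationDesc,
                           node->indexorderby,
                           true,
                           &indexstate->ioss_OrderByKeys,
                           &indexstate->ioss_NumOrderByKeys,
                           &indexstate->ioss_RuntimeKeys,
                           &indexstate->ioss_NumRuntimeKeys,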
                           NULL,    /* no ArrayKeys */
                           NULL);

    /*
     * If we have runtime keys, we need an ExprContext to evaluate them.  The
     * node's standard context won't do because we want to reset that context
     * for every tuple.  So, build another context just like the other one...
     */
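    /* Sketch (assumed): swap contexts so the new one becomes ioss_RuntimeContext */
    if (indexstate->ioss_NumRuntimeKeys != 0)
    {
        ExprContext *stdecontext = indexstate->ss.ps.ps_ExprContext;

        ExecAssignExprContext(estate, &indexstate->ss.ps);
        indexstate->ioss_RuntimeContext = indexstate->ss.ps.ps_ExprContext;
        indexstate->ss.ps.ps_ExprContext = stdecontext;
    }
    else
        indexstate->ioss_RuntimeContext = NULL;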
    indnkeyatts = indexRelation->rd_index->indnkeyatts;

    /*
     * The "name" type for btree uses text_ops which results in storing
     * cstrings in the indexed keys rather than names.  Here we detect that in
     * a generic way in case other index AMs want to do the same optimization.
     * Check for opclasses with an opcintype of NAMEOID and an index tuple
     * descriptor with CSTRINGOID.  If any of these are found, create an array
     * marking the index attribute number of each of them.  StoreIndexTuple()
     * handles copying the name Datums into a NAMEDATALEN-byte allocation.
     */

    /* First, count the number of such index keys */
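    /*
     * Sketch (assumed; "namecount" is hypothetical, and "indexRelation" is
     * taken to be a local alias for indexstate->ioss_RelationDesc).
     */
    namecount = 0;
    for (int attnum = 0; attnum < indnkeyatts; attnum++)
    {
        if (TupleDescAttr(tupDesc, attnum)->atttypid == CSTRINGOID &&
            indexRelation->rd_opcintype[attnum] == NAMEOID)
            namecount++;
    }
    indexstate->ioss_NameCStringCount = namecount;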
    /*
     * Now create an array to mark the attribute numbers of the keys that
     * need to be converted from cstring to name.
     */
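    /* Sketch (assumed) */
    if (namecount > 0)
    {
        int         n = 0;

        indexstate->ioss_NameCStringAttNums = (AttrNumber *)
            palloc(sizeof(AttrNumber) * namecount);

        for (int attnum = 0; attnum < indnkeyatts; attnum++)
        {
            if (TupleDescAttr(tupDesc, attnum)->atttypid == CSTRINGOID &&
                indexRelation->rd_opcintype[attnum] == NAMEOID)
                indexstate->ioss_NameCStringAttNums[n++] = (AttrNumber) attnum;
        }
    }
    else
        indexstate->ioss_NameCStringAttNums = NULL;

    return indexstate;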
/* ----------------------------------------------------------------
 *						Parallel Index-only Scan Support
 * ----------------------------------------------------------------
 */

/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanEstimate
 *
 *		Compute the amount of space we'll need in the parallel
 *		query DSM, and inform pcxt->estimator about our needs.
 * ----------------------------------------------------------------
 */
    if (!instrument && !parallel_aware)
    {
        /* No DSM required by the scan */
        return;
    }
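    /* Sketch (assumed): ask the index AM how much shared space it needs */
    node->ioss_PscanLen =
        index_parallelscan_estimate(node->ioss_RelationDesc,
                                    node->ioss_NumScanKeys,
                                    node->ioss_NumOrderByKeys,
                                    estate->es_snapshot,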
                                    instrument, parallel_aware, pcxt->nworkers);
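    /* assumed: register the chunk with the DSM estimator */
    shm_toc_estimate_chunk(&pcxt->estimator, node->ioss_PscanLen);
    shm_toc_estimate_keys(&pcxt->estimator, 1);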
/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanInitializeDSM
 *
 *		Set up a parallel index-only scan descriptor.
 * ----------------------------------------------------------------
 */
    if (!instrument && !parallel_aware)
    {
        /* No DSM required by the scan */
        return;
    }
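    /* Sketch (assumed): carve out our chunk of the DSM and let the AM fill it */
    piscan = shm_toc_allocate(pcxt->toc, node->ioss_PscanLen);
    node->ioss_SharedInfo = NULL;
    index_parallelscan_initialize(node->ss.ss_currentRelation,
                                  node->ioss_RelationDesc,
                                  estate->es_snapshot,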
                                  instrument, parallel_aware, pcxt->nworkers,
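                                  &node->ioss_SharedInfo, piscan);
    shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, piscan);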
    if (!parallel_aware)
    {
        /* Only here to initialize SharedInfo in DSM */
        return;
    }

    /*
     * If no run-time keys to calculate or they are ready, go ahead and pass
     * the scankeys to the index AM.
     */
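    /* Sketch (assumed) */
    node->ioss_ScanDesc =
        index_beginscan_parallel(node->ss.ss_currentRelation,
                                 node->ioss_RelationDesc,
                                 &node->ioss_Instrument,
                                 node->ioss_NumScanKeys,
                                 node->ioss_NumOrderByKeys,
                                 piscan);
    node->ioss_ScanDesc->xs_want_itup = true;
    node->ioss_VMBuffer = InvalidBuffer;

    if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
        index_rescan(node->ioss_ScanDesc,
                     node->ioss_ScanKeys, node->ioss_NumScanKeys,
                     node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);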
/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanReInitializeDSM
 *
 *		Reset shared state before beginning a fresh scan.
 * ----------------------------------------------------------------
 */
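    /* Sketch (assumed): let the index AM reset the shared scan state */
    index_parallelrescan(node->ioss_ScanDesc);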
/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanInitializeWorker
 *
 *		Copy relevant information from TOC into planstate.
 * ----------------------------------------------------------------
 */
    if (!instrument && !parallel_aware)
    {
        /* No DSM required by the scan */
        return;
    }
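    /* Sketch (assumed): find our chunk by plan node ID */
    piscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);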
    if (!parallel_aware)
    {
        /* Only here to set up worker node's SharedInfo */
        return;
    }

    /*
     * If no run-time keys to calculate or they are ready, go ahead and pass
     * the scankeys to the index AM.
     */
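    /* Sketch (assumed), mirroring ExecIndexOnlyScanInitializeDSM */
    node->ioss_ScanDesc =
        index_beginscan_parallel(node->ss.ss_currentRelation,
                                 node->ioss_RelationDesc,
                                 &node->ioss_Instrument,
                                 node->ioss_NumScanKeys,
                                 node->ioss_NumOrderByKeys,
                                 piscan);
    node->ioss_ScanDesc->xs_want_itup = true;

    if (node->ioss_NumRuntimeKeys == 0 || node->ioss_RuntimeKeysReady)
        index_rescan(node->ioss_ScanDesc,
                     node->ioss_ScanKeys, node->ioss_NumScanKeys,
                     node->ioss_OrderByKeys, node->ioss_NumOrderByKeys);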
/* ----------------------------------------------------------------
 *		ExecIndexOnlyScanRetrieveInstrumentation
 *
 *		Transfer index-only scan statistics from DSM to private memory.
 * ----------------------------------------------------------------
 */
    if (SharedInfo == NULL)
        return;

    /* Create a copy of SharedInfo in backend-local memory */
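    /* Sketch (assumed): size covers the flexible winstrument[] array */
    Size        size;

    size = offsetof(SharedIndexScanInstrumentation, winstrument) +
        SharedInfo->num_workers * sizeof(IndexScanInstrumentation);
    node->ioss_SharedInfo = palloc(size);
    memcpy(node->ioss_SharedInfo, SharedInfo, size);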