/*-------------------------------------------------------------------------
 *
 * spgvacuum.c
 *	  vacuum for SP-GiST
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/spgist/spgvacuum.c
 *
 *-------------------------------------------------------------------------
 */
/* Entry in pending-list of TIDs we need to revisit */
typedef struct spgVacPendingItem
{
	ItemPointerData tid;		/* redirection target to visit */
	bool		done;			/* have we dealt with this? */
	struct spgVacPendingItem *next; /* list link */
} spgVacPendingItem;
/* Local state for vacuum operations */

	/* Parameters passed in to spgvacuumscan */

	/* Additional working state */
/*
 * Add TID to pendingList, but only if not already present.
 *
 * Note that new items are always appended at the end of the list; this
 * ensures that scans of the list don't miss items added during the scan.
 */
static void
spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid)
	/* search the list for pre-existing entry */
	listLink = &bds->pendingList;
	while (*listLink != NULL)
	{
		pitem = *listLink;
		if (ItemPointerEquals(tid, &pitem->tid))
			return;				/* already in list, do nothing */
		listLink = &pitem->next;
	}

	/* not there, so append new entry */
	for (pitem = bds->pendingList; pitem != NULL; pitem = nitem)
	{
		nitem = pitem->next;
		/* All items in list should have been dealt with */
		Assert(pitem->done);
		pfree(pitem);
	}
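
/*
 * Illustrative, self-contained sketch (not PostgreSQL code): the same
 * append-with-duplicate-filtering idea used by spgAddPendingTID above, on a
 * simplified TID type.  Appending at the tail means a scan already walking
 * the list still reaches entries added behind it.  All Demo* / demo_* names
 * are hypothetical and exist only for this example.
 */
#include <stdbool.h>
#include <stdlib.h>

typedef struct DemoTid
{
	unsigned	blkno;
	unsigned	offset;
} DemoTid;

typedef struct DemoPendingItem
{
	DemoTid		tid;
	bool		done;
	struct DemoPendingItem *next;
} DemoPendingItem;

/* Append tid to *head unless an equal entry is already present. */
static void
demo_add_pending(DemoPendingItem **head, DemoTid tid)
{
	DemoPendingItem **link = head;
	DemoPendingItem *item;

	while (*link != NULL)
	{
		item = *link;
		if (item->tid.blkno == tid.blkno && item->tid.offset == tid.offset)
			return;				/* already present, do nothing */
		link = &item->next;
	}

	/* append at the tail so in-progress scans of the list still see it */
	item = malloc(sizeof(DemoPendingItem));
	if (item == NULL)
		return;					/* out of memory; a real caller would report this */
	item->tid = tid;
	item->done = false;
	item->next = NULL;
	*link = item;
}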
/*
 * Vacuum a regular (non-root) leaf page
 *
 * We must delete tuples that are targeted for deletion by the VACUUM,
 * but not move any tuples that are referenced by outside links; we assume
 * those are the ones that are heads of chains.
 *
 * If we find a REDIRECT that was made by a concurrently-running transaction,
 * we must add its target TID to pendingList.  (We don't try to visit the
 * target immediately, first because we don't want VACUUM locking more than
 * one buffer at a time, and second because the duplicate-filtering logic
 * in spgAddPendingTID is useful to ensure we can't get caught in an infinite
 * loop in the face of continuous concurrent insertions.)
 *
 * If forPending is true, we are examining the page as a consequence of
 * chasing a redirect link, not as part of the normal sequential scan.
 * We still vacuum the page normally, but we don't increment the stats
 * about live tuples; else we'd double-count those tuples, since the page
 * has been or will be visited in the sequential scan as well.
 */
static void
vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
			   bool forPending)
	memset(predecessor, 0, sizeof(predecessor));
	memset(deletable, 0, sizeof(deletable));
	/* Scan page, identify tuples to delete, accumulate stats */

			/* Form predecessor map, too */

				/* paranoia about corrupted chain links */
				elog(ERROR, "inconsistent tuple chain links in page %u of index \"%s\"",
					 BufferGetBlockNumber(buffer),
					 RelationGetRelationName(index));
			/*
			 * Add target TID to pending list if the redirection could have
			 * happened since VACUUM started.  (If xid is invalid, assume it
			 * must have happened before VACUUM started, since REINDEX
			 * CONCURRENTLY locks out VACUUM.)
			 *
			 * Note: we could make a tighter test by seeing if the xid is
			 * "running" according to the active snapshot; but snapmgr.c
			 * doesn't currently export a suitable API, and it's not entirely
			 * clear that a tighter test is worth the cycles anyway.
			 */
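
/*
 * Illustrative, standalone sketch (not the file's code): the gate described
 * above amounts to "the redirect's XID is valid and is no older than the
 * XID horizon VACUUM started with", using a wraparound-aware comparison.
 * The demo_* names and plain uint32 XIDs are assumptions for this example.
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_INVALID_XID 0

/* circular comparison; 32-bit transaction IDs wrap around */
static bool
demo_xid_follows_or_equals(uint32_t id1, uint32_t id2)
{
	int32_t		diff = (int32_t) (id1 - id2);

	return diff >= 0;
}

static bool
demo_should_queue_redirect(uint32_t redirect_xid, uint32_t vacuum_xmin)
{
	/* invalid xid: treat as "made before VACUUM started", nothing to queue */
	if (redirect_xid == DEMO_INVALID_XID)
		return false;

	/* queue the target only if the redirection could postdate VACUUM's start */
	return demo_xid_follows_or_equals(redirect_xid, vacuum_xmin);
}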
		return;					/* nothing more to do */
	/*
	 * Figure out exactly what we have to do.  We do this separately from
	 * actually modifying the page, mainly so that we have a representation
	 * that can be dumped into WAL and then the replay code can do exactly
	 * the same thing.  The output of this step consists of six arrays
	 * describing four kinds of operations, to be performed in this order:
	 *
	 * toDead[]: tuple numbers to be replaced with DEAD tuples
	 * toPlaceholder[]: tuple numbers to be replaced with PLACEHOLDER tuples
	 * moveSrc[]: tuple numbers that need to be relocated to another offset
	 * (replacing the tuple there) and then replaced with PLACEHOLDER tuples
	 * moveDest[]: new locations for moveSrc tuples
	 * chainSrc[]: tuple numbers whose chain links (nextOffset) need updates
	 * chainDest[]: new values of nextOffset for chainSrc members
	 *
	 * It's easiest to figure out what we have to do by processing tuple
	 * chains, so we iterate over all the tuples (not just the deletable
	 * ones!) to identify chain heads, then chase down each chain and make
	 * work item entries for deletable tuples within the chain.
	 */
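
/*
 * Illustrative, standalone sketch (not the file's code): classifying one
 * leaf-tuple chain into the work arrays described above.  The hard-coded
 * chain, its deletable flags, and all names here are assumptions made up
 * for this example; the real code derives everything from the page.
 */
#include <stdio.h>

#define DEMO_INVALID 0

int
main(void)
{
	/* one chain, in link order; offset 2 is the head */
	int			chain[] = {2, 5, 7, 9};
	int			nchain = 4;
	int			deletable[10] = {0};

	int			toDead[10], toPlaceholder[10], moveSrc[10], moveDest[10];
	int			chainSrc[10], chainDest[10];
	int			nDead = 0, nPlaceholder = 0, nMove = 0, nChain = 0;

	int			head = chain[0];
	int			prevLive;
	int			interveningDeletable = 0;
	int			k;

	deletable[2] = deletable[7] = 1;	/* VACUUM wants these two gone */

	/* the head slot can't move, so it is "previous live" only if it survives */
	prevLive = deletable[head] ? DEMO_INVALID : head;

	for (k = 1; k < nchain; k++)
	{
		int			j = chain[k];

		if (deletable[j])
		{
			toPlaceholder[nPlaceholder++] = j;	/* becomes a placeholder */
			interveningDeletable = 1;	/* previous live link needs an update */
		}
		else if (prevLive == DEMO_INVALID)
		{
			/* first live tuple: move it into the head position */
			moveSrc[nMove] = j;
			moveDest[nMove] = head;
			nMove++;
			prevLive = head;
			interveningDeletable = 0;
		}
		else
		{
			/* later live tuple: re-chain it over any deleted gap */
			if (interveningDeletable)
			{
				chainSrc[nChain] = prevLive;
				chainDest[nChain] = j;
				nChain++;
			}
			prevLive = j;
			interveningDeletable = 0;
		}
	}

	if (prevLive == DEMO_INVALID)
		toDead[nDead++] = head; /* whole chain removable: head becomes DEAD */
	else if (interveningDeletable)
	{
		/* deletions at the end of the chain: close it off */
		chainSrc[nChain] = prevLive;
		chainDest[nChain] = DEMO_INVALID;
		nChain++;
	}

	/* expected here: 0 dead, 1 placeholder, 1 move (5 -> 2), 1 re-chain (2 -> 9) */
	printf("dead=%d placeholder=%d move=%d chain=%d\n",
		   nDead, nPlaceholder, nMove, nChain);
	if (nMove > 0)
		printf("move %d -> %d\n", moveSrc[0], moveDest[0]);
	if (nChain > 0)
		printf("re-chain %d -> %d\n", chainSrc[0], chainDest[0]);
	return 0;
}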
		bool		interveningDeletable;

		if (head->tupstate != SPGIST_LIVE)
			continue;			/* can't be a chain member */
		if (predecessor[i] != 0)
			continue;			/* not a chain head */

		interveningDeletable = false;
		/* scan down the chain ... */

				/* all tuples in chain should be live */
				elog(ERROR, "unexpected SPGiST tuple state: %d",
					 lt->tupstate);
				/* This tuple should be replaced by a placeholder */

				/* previous live tuple's chain link will need an update */
				interveningDeletable = true;

				/*
				 * This is the first live tuple in the chain.  It has to move
				 * to the head position.
				 */
				moveSrc[xlrec.nMove] = j;
				moveDest[xlrec.nMove] = i;

				/* Chain updates will be applied after the move */
				interveningDeletable = false;
				/*
				 * Second or later live tuple.  Arrange to re-chain it to the
				 * previous live one, if there was a gap.
				 */
				if (interveningDeletable)
				{
					chainSrc[xlrec.nChain] = prevLive;
					chainDest[xlrec.nChain] = j;
					xlrec.nChain++;
				}

				interveningDeletable = false;
		if (prevLive == InvalidOffsetNumber)
		{
			/* The chain is entirely removable, so we need a DEAD tuple */
			toDead[xlrec.nDead] = i;
			xlrec.nDead++;
		}
		else if (interveningDeletable)
		{
			/* One or more deletions at end of chain, so close it off */
			chainSrc[xlrec.nChain] = prevLive;
			chainDest[xlrec.nChain] = InvalidOffsetNumber;
			xlrec.nChain++;
		}

	/* sanity check ... */
	if (nDeletable != xlrec.nDead + xlrec.nPlaceholder + xlrec.nMove)
		elog(ERROR, "inconsistent counts of deletable tuples");
	/*
	 * We implement the move step by swapping the line pointers of the source
	 * and target tuples, then replacing the newly-source tuples with
	 * placeholders.  This is perhaps unduly friendly with the page data
	 * representation, but it's fast and doesn't risk page overflow when a
	 * tuple to be relocated is large.
	 */

	spgPageIndexMultiDelete(&bds->spgstate, page,
							moveSrc, xlrec.nMove,
							SPGIST_PLACEHOLDER, SPGIST_PLACEHOLDER,
							InvalidBlockNumber, InvalidOffsetNumber);

	/* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
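
/*
 * Illustrative, standalone sketch (not the file's code): "moving" a tuple by
 * swapping two line-pointer slots, so tuple bodies are never copied and the
 * page cannot overflow even when the relocated tuple is large.  The source
 * slot is then free to be turned into a placeholder.  DemoLinePointer and
 * demo_move_tuple are hypothetical names for this example.
 */
#include <stdint.h>

typedef struct DemoLinePointer
{
	uint16_t	off;			/* byte offset of the tuple body in the page */
	uint16_t	len;			/* length of the tuple body */
} DemoLinePointer;

static void
demo_move_tuple(DemoLinePointer *lp, int src, int dest)
{
	DemoLinePointer tmp = lp[src];

	lp[src] = lp[dest];
	lp[dest] = tmp;
}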
/*
 * Vacuum a root page when it is also a leaf
 *
 * On the root, we just delete any dead leaf tuples; no fancy business
 */
static void
vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer)

	/* Scan page, identify tuples to delete, accumulate stats */

			/* all tuples on root should be live */
			elog(ERROR, "unexpected SPGiST tuple state: %d",
				 lt->tupstate);

		return;					/* nothing more to do */
	/* The tuple numbers are in order, so we can use PageIndexMultiDelete */

	/* Prepare WAL record */

	/* sizeof(xlrec) should be a multiple of sizeof(OffsetNumber) */
/*
 * Clean up redirect and placeholder tuples on the given page
 *
 * Redirect tuples can be marked placeholder once they're old enough.
 * Placeholder tuples can be removed if it won't change the offsets of
 * non-placeholder ones.
 *
 * Unlike the routines above, this works on both leaf and inner pages.
 */
static void
vacuumRedirectAndPlaceholder(Relation index, Relation heaprel, Buffer buffer)

	bool		hasNonPlaceholder = false;
	bool		hasUpdate = false;
	/*
	 * Scan backwards to convert old redirection tuples to placeholder tuples,
	 * and identify location of last non-placeholder tuple while at it.
	 */

		/*
		 * We can convert a REDIRECT to a PLACEHOLDER if there could no longer
		 * be any index scans "in flight" to it.  Such an index scan would
		 * have to be in a transaction whose snapshot sees the REDIRECT's XID
		 * as still running, so comparing the XID against global xmin is a
		 * conservatively safe test.  If the XID is invalid, it must have been
		 * inserted by REINDEX CONCURRENTLY, so we can zap it immediately.
		 */
			/* remember newest XID among the removed redirects */

			if (!hasNonPlaceholder)
				firstPlaceholder = i;

			hasNonPlaceholder = true;
	/*
	 * Any placeholder tuples at the end of page can safely be removed.  We
	 * can't remove ones before the last non-placeholder, though, because we
	 * can't alter the offset numbers of non-placeholder tuples.
	 */

		/* We do not store this array to rdata because it's easy to recreate. */
		for (i = firstPlaceholder; i <= max; i++)
			itemnos[i - firstPlaceholder] = i;

		i = max - firstPlaceholder + 1;

		/* The array is surely sorted, so can use PageIndexMultiDelete */
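
/*
 * Illustrative, standalone sketch (not the file's code): only the trailing
 * run of placeholders can be deleted, because removing anything earlier
 * would shift the offset numbers of tuples that other pages still point at.
 * The demo_* names and the flat state array are assumptions for this example.
 */
#include <stdio.h>

#define DEMO_LIVE 0
#define DEMO_PLACEHOLDER 1

/* Return the first offset (1-based) of the removable trailing run, or max + 1. */
static int
demo_first_trailing_placeholder(const int *state, int max)
{
	int			first = max + 1;
	int			i;

	for (i = max; i >= 1; i--)
	{
		if (state[i] != DEMO_PLACEHOLDER)
			break;				/* found the last non-placeholder */
		first = i;
	}
	return first;
}

int
main(void)
{
	/* offsets 1..6; only 5 and 6 may go, not the placeholder stuck at offset 2 */
	int			state[7] = {DEMO_LIVE, DEMO_LIVE, DEMO_PLACEHOLDER, DEMO_LIVE,
							DEMO_LIVE, DEMO_PLACEHOLDER, DEMO_PLACEHOLDER};

	printf("first removable offset: %d\n",
		   demo_first_trailing_placeholder(state, 6));		/* prints 5 */
	return 0;
}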
/*
 * Process one page during a bulkdelete scan
 */
static void
spgvacuumpage(spgBulkDeleteState *bds, Buffer buffer)

		/*
		 * We found an all-zero page, which could happen if the database
		 * crashed just after extending the file.  Recycle it.
		 */

		/* no need for vacuumRedirectAndPlaceholder */
	/*
	 * The root pages must never be deleted, nor marked as available in FSM,
	 * because we don't want them ever returned by a search for a place to put
	 * a new tuple.  Otherwise, check for empty page, and make sure the FSM
	 * knows about it.
	 */
/*
 * Process the pending-TID list between pages of the main scan
 */
static void
spgprocesspending(spgBulkDeleteState *bds)

		if (pitem->done)
			continue;			/* ignore already-done items */

		/* call vacuum_delay_point while not holding any buffer lock */

		/* examine the referenced page */

			/* Probably shouldn't happen, but ignore it */

			/* this should definitely not happen */
			elog(ERROR, "redirection leads to root page of index \"%s\"",
				 RelationGetRelationName(index));

			/* deal with any deletable tuples */
			/* might as well do this while we are here */

			/*
			 * We can mark as done not only this item, but any later ones
			 * pointing at the same page, since we vacuumed the whole page.
			 */
			pitem->done = true;
			for (nitem = pitem->next; nitem != NULL; nitem = nitem->next)
			/*
			 * On an inner page, visit the referenced inner tuple and add all
			 * its downlinks to the pending list.  We might have pending items
			 * for more than one inner tuple on the same page (in fact this is
			 * pretty likely given the way space allocation works), so get
			 * them all while we are here.
			 */
			for (nitem = pitem; nitem != NULL; nitem = nitem->next)
					/* transfer attention to redirect point */

					elog(ERROR, "unexpected SPGiST tuple state: %d",
						 innerTuple->tupstate);
/*
 * Perform a bulkdelete scan
 */
static void
spgvacuumscan(spgBulkDeleteState *bds)

	/* Finish setting up spgBulkDeleteState */

	/*
	 * Reset counts that will be incremented during the scan; needed in case
	 * of multiple scans during a single VACUUM command
	 */
	/* We can skip locking for new or temp relations */

	/*
	 * It is safe to use batchmode as block_range_read_stream_cb takes no
	 * locks.
	 */
	/*
	 * The outer loop iterates over all index pages except the metapage, in
	 * physical order (we hope the kernel will cooperate in providing
	 * read-ahead for speed).  It is critical that we visit all leaf pages,
	 * including ones added after we start the scan, else we might fail to
	 * delete some deletable tuples.  See more extensive comments about this
	 * in btvacuumscan().
	 */
		/* Get the current relation length */

		/* Quit if we've scanned the whole relation */

		/* Iterate over pages, then loop back to recheck length */

			/* call vacuum_delay_point while not holding any buffer lock */

			/* empty the pending-list after each page */

		/*
		 * We have to reset the read stream to use it again.  After returning
		 * InvalidBuffer, the read stream API won't invoke our callback again
		 * until the stream has been reset.
		 */
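
/*
 * Illustrative, standalone sketch (not the file's code): the outer-loop
 * pattern of re-checking the relation length until no new pages appear, so
 * pages added after the scan started are still visited.  The demo_* names
 * and callback parameters are assumptions for this example.
 */
static void
demo_scan_all_pages(unsigned first_block,
					unsigned (*get_length) (void *arg),
					void (*process_page) (unsigned blkno, void *arg),
					void *arg)
{
	unsigned	blkno = first_block;

	for (;;)
	{
		/* Get the current relation length */
		unsigned	num_pages = get_length(arg);

		/* Quit if we've scanned the whole relation */
		if (blkno >= num_pages)
			break;

		/* Iterate over pages, then loop back to recheck length */
		for (; blkno < num_pages; blkno++)
			process_page(blkno, arg);
	}
}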
	/* Propagate local lastUsedPages cache to metablock */

	/*
	 * If we found any empty pages (and recorded them in the FSM), then
	 * forcibly update the upper-level FSM pages to ensure that searchers can
	 * find them.  It's possible that the pages were also found during
	 * previous scans and so this is a waste of time, but it's cheap enough
	 * relative to scanning the index that it shouldn't matter much, and
	 * making sure that free pages are available sooner not later seems
	 * worthwhile.
	 *
	 * Note that if no empty pages exist, we don't bother vacuuming the FSM at
	 * all.
	 */
	/*
	 * Truncate index if possible
	 *
	 * XXX disabled because it's unsafe due to possible concurrent inserts.
	 * We'd have to rescan the pages to make sure they're still empty, and it
	 * doesn't seem worth it.  Note that btree doesn't do this either.
	 *
	 * Another reason not to truncate is that it could invalidate the cached
	 * pages-with-freespace pointers in the metapage and other backends'
	 * relation caches, that is leave them pointing to nonexistent pages.
	 * Adding RelationGetNumberOfBlocks calls to protect the places that use
	 * those pointers would be unduly expensive.
	 */

	/* Report final stats */
/*
 * Bulk deletion of all index entries pointing to a set of heap tuples.
 * The set of target tuples is specified via a callback routine that tells
 * whether any given heap tuple (identified by ItemPointer) is being deleted.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
			  IndexBulkDeleteCallback callback, void *callback_state)

	/* allocate stats if first time through, else re-use existing struct */
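
/*
 * Illustrative, standalone sketch (not the file's code): the shape of a
 * "is this heap TID being deleted?" callback, here answered with a binary
 * search over a sorted array of dead TIDs.  DemoHeapTid, DemoDeadSet, and
 * demo_tid_deleted are hypothetical names for this example.
 */
#include <stdbool.h>
#include <stddef.h>

typedef struct DemoHeapTid
{
	unsigned	blkno;
	unsigned	offset;
} DemoHeapTid;

typedef struct DemoDeadSet
{
	const DemoHeapTid *tids;	/* sorted by (blkno, offset) */
	size_t		ntids;
} DemoDeadSet;

static int
demo_heap_tid_cmp(const DemoHeapTid *a, const DemoHeapTid *b)
{
	if (a->blkno != b->blkno)
		return a->blkno < b->blkno ? -1 : 1;
	if (a->offset != b->offset)
		return a->offset < b->offset ? -1 : 1;
	return 0;
}

/* callback: true if the index entry's heap TID is in the dead set */
static bool
demo_tid_deleted(const DemoHeapTid *tid, void *state)
{
	const DemoDeadSet *set = state;
	size_t		lo = 0,
				hi = set->ntids;

	while (lo < hi)
	{
		size_t		mid = lo + (hi - lo) / 2;
		int			cmp = demo_heap_tid_cmp(&set->tids[mid], tid);

		if (cmp == 0)
			return true;
		if (cmp < 0)
			lo = mid + 1;
		else
			hi = mid;
	}
	return false;
}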
/* Dummy callback to delete no tuples during spgvacuumcleanup */
static bool
dummy_callback(ItemPointer itemptr, void *state)
{
	return false;
}
/*
 * Post-VACUUM cleanup.
 *
 * Result: a palloc'd struct containing statistical info for VACUUM displays.
 */
IndexBulkDeleteResult *
spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)

	/* No-op in ANALYZE ONLY mode */
	/*
	 * We don't need to scan the index if there was a preceding bulkdelete
	 * pass.  Otherwise, make a pass that won't delete any live tuples, but
	 * might still accomplish useful stuff with redirect/placeholder cleanup
	 * and/or FSM housekeeping, and in any case will provide stats.
	 */
	/*
	 * It's quite possible for us to be fooled by concurrent tuple moves into
	 * double-counting some index tuples, so disbelieve any total that exceeds
	 * the underlying heap's count ... if we know that accurately.  Otherwise
	 * this might just make matters worse.
	 */
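
/*
 * Illustrative, standalone sketch (not the file's code): the clamp described
 * above, trusting the heap's tuple count only when it is known exactly.
 * demo_clamp_index_tuples is a hypothetical name for this example.
 */
static double
demo_clamp_index_tuples(double counted_index_tuples, double heap_tuples,
						int heap_count_is_exact)
{
	if (heap_count_is_exact && counted_index_tuples > heap_tuples)
		return heap_tuples;		/* concurrent moves double-counted some tuples */
	return counted_index_tuples;
}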