1/*-------------------------------------------------------------------------
4 * display visibility map information and page-level visibility bits
6 * Copyright (c) 2016-2025, PostgreSQL Global Development Group
8 * contrib/pg_visibility/pg_visibility.c
9 *-------------------------------------------------------------------------
29 .
name =
"pg_visibility",
47/* for collect_corrupt_items_read_stream_next_block */
77 * Visibility map information for a single block of a relation.
79 * Note: the VM code will silently return zeroes for pages past the end
80 * of the map, so we allow probes up to MaxBlockNumber regardless of the
81 * actual relation size.
97 /* Only some relkinds have a visibility map */
102 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
103 errmsg(
"invalid block number")));
119 * Visibility map information for a single block of a relation, plus the
120 * page-level information for the same block.
138 /* Only some relkinds have a visibility map */
143 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
144 errmsg(
"invalid block number")));
154 /* Here we have to explicitly check rel size ... */
167 /* As with the vismap, silently return 0 for pages past EOF */
177 * Visibility map information for every block in a relation.
193 /* collect_visibility_data will verify the relkind */
220 * Visibility map information for every block in a relation, plus the page
221 * level information for each block.
237 /* collect_visibility_data will verify the relkind */
265 * Count the number of all-visible and all-frozen pages in the visibility
266 * map for a particular relation.
276 int64 all_visible = 0;
277 int64 all_frozen = 0;
284 /* Only some relkinds have a visibility map */
289 for (blkno = 0; blkno < nblocks; ++blkno)
293 /* Make sure we are interruptible. */
310 elog(
ERROR,
"return type must be a row type");
319 * Return the TIDs of non-frozen tuples present in pages marked all-frozen
320 * in the visibility map. We hope no one will ever find any, but there could
321 * be bugs, database corruption, etc.
336 /* collect_corrupt_items will verify the relkind */
351 * Return the TIDs of not-all-visible tuples in pages marked all-visible
352 * in the visibility map. We hope no one will ever find any, but there could
353 * be bugs, database corruption, etc.
368 /* collect_corrupt_items will verify the relkind */
383 * Remove the visibility map fork for a relation. If there turn out to be
384 * any bugs in the visibility map code that require rebuilding the VM, this
385 * provides users with a way to do it that is cleaner than shutting down the
386 * server and removing files by hand.
388 * This is a cut-down version of RelationTruncate.
401 /* Only some relkinds have a visibility map */
404 /* Forcibly reset cached file size */
407 /* Compute new and old size before entering critical section. */
413 * WAL-logging, buffer dropping, file truncation must be atomic and all on
414 * one side of a checkpoint. See RelationTruncate() for discussion.
444 * Release the lock right away, not at commit time.
446 * It would be a problem to release the lock prior to commit if this
447 * truncate operation sends any transactional invalidation messages. Other
448 * backends would potentially be able to lock the relation without
449 * processing them in the window of time between when we release the lock
450 * here and when we send the messages at our eventual commit. However,
451 * we're currently only sending a non-transactional smgr invalidation,
452 * which will have been posted to shared memory immediately from within
453 * smgr_truncate. Therefore, there should be no race here.
455 * The reason why it's desirable to release the lock early here is because
456 * of the possibility that someone will need to use this to blow away many
457 * visibility map forks at once. If we can't release the lock until
458 * commit time, the transaction doing this will accumulate
459 * AccessExclusiveLocks on all of those relations at the same time, which
460 * is undesirable. However, if this turns out to be unsafe we may have no
465 /* Nothing to return. */
470 * Helper function to construct whichever TupleDesc we need for a particular
497 * Collect visibility data about a relation.
499 * Checks relkind of relid and will throw an error if the relation does not
516 /* Only some relkinds have a visibility map */
522 info->
count = nblocks;
524 /* Create a stream if reading main fork. */
531 * It is safe to use batchmode as block_range_read_stream_cb takes no
544 for (blkno = 0; blkno < nblocks; ++blkno)
548 /* Make sure we are interruptible. */
554 info->
bits[blkno] |= (1 << 0);
556 info->
bits[blkno] |= (1 << 1);
559 * Page-level data requires reading every block, so only get it if the
560 * caller needs it. Use a buffer access strategy, too, to prevent
573 info->
bits[blkno] |= (1 << 2);
594 * The "strict" version of GetOldestNonRemovableTransactionId(). The
595 * pg_visibility check can tolerate false negatives (failing to report some
596 * real errors), but can't tolerate false positives (reporting errors that do
597 * not exist). Normally, a horizon moves forward, but there are cases when it
598 * could move backward (see comment for ComputeXidHorizons()).
600 * This is why we have to implement our own function for xid horizon, which
601 * would be guaranteed to be newer or equal to any xid horizon computed before.
602 * We have to do the following to achieve this.
604 * 1. Ignore processes' xmins, because they take into account connections to
605 * other databases that were ignored before.
606 * 2. Ignore KnownAssignedXids, as they are not database-aware. Although we
607 * now perform minimal checking on a standby by always using nextXid, this
608 * approach is better than nothing and will at least catch extremely broken
609 * cases where an xid is in the future.
610 * 3. Ignore walsender xmin, because it could go backward if some replication
611 * connections don't use replication slots.
613 * While it might seem like we could use KnownAssignedXids for shared
614 * catalogs, since shared catalogs rely on a global horizon rather than a
615 * database-specific one - there are potential edge cases. For example, a
616 * transaction may crash on the primary without writing a commit/abort record.
617 * This would lead to a situation where it appears to still be running on the
618 * standby, even though it has already ended on the primary. For this reason,
619 * it's safer to ignore KnownAssignedXids, even for shared catalogs.
621 * As a result, we're using only currently running xids to compute the horizon.
622 * This certainly sacrifices a significant amount of accuracy, but we have to
623 * do so to avoid reporting false errors.
634 /* As we ignore KnownAssignedXids on standby, just pick nextXid */
640 else if (rel == NULL || rel->
rd_rel->relisshared)
642 /* Shared relation: take into account all running xids */
651 * Normal relation: take into account xids running within the current
662 * For temporary relations, ComputeXidHorizons() uses only
663 * TransamVariables->latestCompletedXid and MyProc->xid. These two
664 * shouldn't go backwards. So we're fine with this horizon.
671 * Callback function to get next block for read stream object used in
672 * collect_corrupt_items() function.
676 void *callback_private_data,
677 void *per_buffer_data)
683 bool check_frozen =
false;
684 bool check_visible =
false;
686 /* Make sure we are interruptible. */
692 check_visible =
true;
693 if (!check_visible && !check_frozen)
703 * Returns a list of items whose visibility map information does not match
704 * the status of the tuples on the page.
706 * If all_visible is passed as true, this will include all items which are
707 * on pages marked as all-visible in the visibility map but which do not
708 * seem to in fact be all-visible.
710 * If all_frozen is passed as true, this will include all items which are
711 * on pages marked as all-frozen but which do not seem to in fact be frozen.
713 * Checks relkind of relid and will throw an error if the relation does not
730 /* Only some relkinds have a visibility map */
737 * Guess an initial array size. We don't expect many corrupted tuples, so
738 * start with a small array. This function uses the "next" field to track
739 * the next offset where we can store an item (which is the same thing as
740 * the number of items found so far) and the "count" field to track the
741 * number of entries allocated. We'll repurpose these fields before
763 /* Loop over every block in the relation. */
773 /* Make sure we are interruptible. */
783 * The visibility map bits might have changed while we were acquiring
784 * the page lock. Recheck to avoid returning spurious results.
787 check_frozen =
false;
789 check_visible =
false;
790 if (!check_visible && !check_frozen)
796 /* Iterate over each tuple on the page. */
806 /* Unused or redirect line pointers are of no interest. */
810 /* Dead line pointers are neither all-visible nor frozen. */
818 /* Initialize a HeapTupleData structure for checks below. */
825 * If we're checking whether the page is all-visible, we expect
826 * the tuple to be all-visible.
834 * Time has passed since we computed OldestXmin, so it's
835 * possible that this tuple is all-visible in reality even
836 * though it doesn't appear so based on our
837 * previously-computed value. Let's compute a new value so we
838 * can be certain whether there is a problem.
840 * From a concurrency point of view, it sort of sucks to
841 * retake ProcArrayLock here while we're holding the buffer
842 * exclusively locked, but it should be safe against
843 * deadlocks, because surely
844 * GetStrictOldestNonRemovableTransactionId() should never
845 * take a buffer lock. And this shouldn't happen often, so
846 * it's worth being careful so as to avoid false positives.
854 OldestXmin = RecomputedOldestXmin;
861 * If we're checking whether the page is all-frozen, we expect the
862 * tuple to be in a state where it will never need freezing.
883 * Before returning, repurpose the fields to match caller's expectations.
884 * next is now the next item that should be read (rather than written) and
885 * count is now the number of items we wrote (rather than the number we
895 * Remember one corrupt item.
900 /* enlarge output array if needed. */
907 /* and add the new item */
912 * Check whether a tuple is all-visible relative to a given OldestXmin value.
913 * The buffer should contain the tuple and should be locked and pinned.
923 return false;
/* all-visible implies live */
926 * Neither lazy_scan_heap nor heap_page_is_all_visible will mark a page
927 * all-visible unless every tuple is hinted committed. However, those hint
928 * bits could be lost after a crash, so we can't be certain that they'll
929 * be set here. So just check the xmin.
934 return false;
/* xmin not old enough for all to see */
940 * check_relation_relkind - convenience routine to check that relation
941 * is of the relkind supported by the callers
946 if (!RELKIND_HAS_TABLE_AM(
rel->
rd_rel->relkind))
948 (
errcode(ERRCODE_WRONG_OBJECT_TYPE),
949 errmsg(
"relation \"%s\" is of wrong relation kind",
#define InvalidBlockNumber
static bool BlockNumberIsValid(BlockNumber blockNumber)
static Datum values[MAXATTR]
BlockNumber BufferGetBlockNumber(Buffer buffer)
void ReleaseBuffer(Buffer buffer)
void UnlockReleaseBuffer(Buffer buffer)
void LockBuffer(Buffer buffer, int mode)
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
#define BUFFER_LOCK_SHARE
#define RelationGetNumberOfBlocks(reln)
static Page BufferGetPage(Buffer buffer)
static bool PageIsAllVisible(const PageData *page)
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
#define FLEXIBLE_ARRAY_MEMBER
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
TupleDesc BlessTupleDesc(TupleDesc tupdesc)
#define PG_GETARG_INT64(n)
#define PG_RETURN_DATUM(x)
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
TypeFuncClass get_call_result_type(FunctionCallInfo fcinfo, Oid *resultTypeId, TupleDesc *resultTupleDesc)
#define SRF_IS_FIRSTCALL()
#define SRF_PERCALL_SETUP()
#define SRF_RETURN_NEXT(_funcctx, _result)
#define SRF_FIRSTCALL_INIT()
static Datum HeapTupleGetDatum(const HeapTupleData *tuple)
#define SRF_RETURN_DONE(_funcctx)
Assert(PointerIsAligned(start, uint64))
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
HeapTupleHeaderData * HeapTupleHeader
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
if(TABLE==NULL||TABLE_index==NULL)
#define ItemIdGetLength(itemId)
#define ItemIdIsDead(itemId)
#define ItemIdIsUsed(itemId)
#define ItemIdIsRedirected(itemId)
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
#define AccessExclusiveLock
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
void LWLockRelease(LWLock *lock)
void * repalloc(void *pointer, Size size)
void * palloc0(Size size)
#define START_CRIT_SECTION()
#define CHECK_FOR_INTERRUPTS()
#define END_CRIT_SECTION()
#define OffsetNumberNext(offsetNumber)
#define FirstOffsetNumber
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
int errdetail_relkind_not_supported(char relkind)
static BlockNumber collect_corrupt_items_read_stream_next_block(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
static corrupt_items * collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
Datum pg_visibility_map_summary(PG_FUNCTION_ARGS)
Datum pg_visibility_rel(PG_FUNCTION_ARGS)
PG_FUNCTION_INFO_V1(pg_visibility_map)
struct corrupt_items corrupt_items
static TupleDesc pg_visibility_tupdesc(bool include_blkno, bool include_pd)
static void check_relation_relkind(Relation rel)
static void record_corrupt_item(corrupt_items *items, ItemPointer tid)
static TransactionId GetStrictOldestNonRemovableTransactionId(Relation rel)
Datum pg_visibility_map(PG_FUNCTION_ARGS)
static vbits * collect_visibility_data(Oid relid, bool include_pd)
Datum pg_visibility_map_rel(PG_FUNCTION_ARGS)
PG_MODULE_MAGIC_EXT(.name="pg_visibility",.version=PG_VERSION)
Datum pg_check_visible(PG_FUNCTION_ARGS)
Datum pg_check_frozen(PG_FUNCTION_ARGS)
Datum pg_visibility(PG_FUNCTION_ARGS)
static bool tuple_all_visible(HeapTuple tup, TransactionId OldestXmin, Buffer buffer)
Datum pg_truncate_visibility_map(PG_FUNCTION_ARGS)
static Datum Int64GetDatum(int64 X)
static Datum PointerGetDatum(const void *X)
static Datum BoolGetDatum(bool X)
#define DELAY_CHKPT_START
#define DELAY_CHKPT_COMPLETE
TransactionId GetOldestNonRemovableTransactionId(Relation rel)
RunningTransactions GetRunningTransactionData(void)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
#define RELATION_IS_LOCAL(relation)
static SMgrRelation RelationGetSmgr(Relation rel)
#define RelationGetRelationName(relation)
#define RelationNeedsWAL(relation)
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *old_nblocks, BlockNumber *nblocks)
void relation_close(Relation relation, LOCKMODE lockmode)
Relation relation_open(Oid relationId, LOCKMODE lockmode)
#define XLOG_SMGR_TRUNCATE
BlockNumber last_exclusive
BlockNumber current_blocknum
MemoryContext multi_call_memory_ctx
RelFileLocator rd_locator
TransactionId oldestRunningXid
TransactionId oldestDatabaseRunningXid
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
FullTransactionId nextXid
BlockNumber last_exclusive
BlockNumber current_blocknum
uint8 bits[FLEXIBLE_ARRAY_MEMBER]
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
#define InvalidTransactionId
#define XidFromFullTransactionId(x)
TupleDesc CreateTemplateTupleDesc(int natts)
void TupleDescInitEntry(TupleDesc desc, AttrNumber attributeNumber, const char *attributeName, Oid oidtypeid, int32 typmod, int attdim)
TransamVariablesData * TransamVariables
uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
#define VM_ALL_VISIBLE(r, b, v)
#define VM_ALL_FROZEN(r, b, v)
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_ALL_VISIBLE
bool RecoveryInProgress(void)
void XLogFlush(XLogRecPtr record)
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
void XLogRegisterData(const void *data, uint32 len)
void XLogBeginInsert(void)
#define XLR_SPECIAL_REL_UPDATE