/*-------------------------------------------------------------------------
 *
 * visibilitymap.c
 *	  bitmap for tracking visibility of heap tuples
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/visibilitymap.c
 *
 * INTERFACE ROUTINES
 *		visibilitymap_clear  - clear bits for one page in the visibility map
 *		visibilitymap_pin	 - pin a map page for setting a bit
 *		visibilitymap_pin_ok - check whether correct map page is already pinned
 *		visibilitymap_set	 - set a bit in a previously pinned page
 *		visibilitymap_get_status - get status of bits
 *		visibilitymap_count  - count number of bits set in visibility map
 *		visibilitymap_prepare_truncate -
 *			prepare for truncation of the visibility map
 *
 * NOTES
 *
 * The visibility map is a bitmap with two bits (all-visible and all-frozen)
 * per heap page. A set all-visible bit means that all tuples on the page are
 * known visible to all transactions, and therefore the page doesn't need to
 * be vacuumed. A set all-frozen bit means that all tuples on the page are
 * completely frozen, and therefore the page doesn't need to be vacuumed even
 * if a whole-table-scanning vacuum is required (e.g. anti-wraparound vacuum).
 * The all-frozen bit must be set only when the page is already all-visible.
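 *
 * Illustrative sketch (not part of the original comment): with the flag
 * values from visibilitymap.h, each heap page consumes two adjacent bits,
 * and the only meaningful states are
 *
 *		0x00							neither bit set (page must be vacuumed)
 *		VISIBILITYMAP_ALL_VISIBLE (0x01)		page needs no ordinary vacuum
 *		VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN (0x03)
 *								page can be skipped even by an
 *								aggressive (anti-wraparound) vacuum
 *
 * The fourth combination, all-frozen without all-visible (0x02), is invalid
 * and is asserted against elsewhere in this file.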
 * The map is conservative in the sense that we make sure that whenever a bit
 * is set, we know the condition is true, but if a bit is not set, it might or
 * might not be true.
 *
 * Clearing visibility map bits is not separately WAL-logged. The callers
 * must make sure that whenever a bit is cleared, the bit is cleared on WAL
 * replay of the updating operation as well.
 * When we *set* a visibility map bit during VACUUM, we must write WAL. This
 * may seem counterintuitive, since the bit is basically a hint: if it is
 * clear, it may still be the case that every tuple on the page is visible to
 * all transactions; we just don't know that for certain. The difficulty is
 * that there are two bits which are typically set together: the
 * PD_ALL_VISIBLE bit on the page itself, and the visibility map bit. If a
 * crash occurs after the visibility map page makes it to disk and before the
 * updated heap page makes it to disk, redo must set the bit on the heap page.
 * Otherwise, the next insert, update, or delete on the heap page will fail to
 * realize that the visibility map bit must be cleared, possibly causing
 * index-only scans to return wrong answers.
 * VACUUM will normally skip pages for which the visibility map bit is set;
 * such pages can't contain any dead tuples and therefore don't need vacuuming.
 * In heapam.c, whenever a page is modified so that not all tuples on the
 * page are visible to everyone anymore, the corresponding bit in the
 * visibility map is cleared. In order to be crash-safe, we need to do this
 * while still holding a lock on the heap page and in the same critical
 * section that logs the page modification. However, we don't want to hold
 * the buffer lock over any I/O that may be required to read in the visibility
 * map page. To avoid this, we examine the heap page before locking it;
 * if the page-level PD_ALL_VISIBLE bit is set, we pin the visibility map
 * page. Then, we lock the buffer. But this creates a race condition: there
 * is a possibility that in the time it takes to lock the buffer, the
 * PD_ALL_VISIBLE bit gets set. If that happens, we have to unlock the
 * buffer, pin the visibility map page, and relock the buffer. This shouldn't
 * happen often, because only VACUUM currently sets visibility map bits,
 * and the race will only occur if VACUUM processes a given page at almost
 * exactly the same time that someone tries to further modify it.
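 *
 * Illustrative sketch of that dance (hedged; this is not the actual heapam.c
 * code, and the variable names are assumptions):
 *
 *		if (PageIsAllVisible(BufferGetPage(heapbuf)))
 *			visibilitymap_pin(rel, blk, &vmbuf);
 *		LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *		if (PageIsAllVisible(BufferGetPage(heapbuf)) && !BufferIsValid(vmbuf))
 *		{
 *			LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
 *			visibilitymap_pin(rel, blk, &vmbuf);	(may do I/O; no lock held)
 *			LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *		}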
 * To set a bit, you need to hold a lock on the heap page. That prevents
 * the race condition where VACUUM sees that all tuples on the page are
 * visible to everyone, but another backend modifies the page before VACUUM
 * sets the bit in the visibility map.
 * When a bit is set, the LSN of the visibility map page is updated to make
 * sure that the visibility map update doesn't get written to disk before the
 * WAL record of the changes that made it possible to set the bit is flushed.
 * But when a bit is cleared, we don't have to do that because it's always
 * safe to clear a bit in the map from a correctness point of view.
 *
 *-------------------------------------------------------------------------
 */
/*#define TRACE_VISIBILITYMAP */
/*
 * Size of the bitmap on each visibility map page, in bytes. There are no
 * extra headers, so the whole page minus the standard page header is
 * used for the bitmap.
 */
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
/* Number of heap blocks we can represent in one byte */
#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)

/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
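
/*
 * Worked example (assuming the default BLCKSZ of 8192 and the usual 24-byte
 * page header, so MAPSIZE = 8192 - 24 = 8168): with BITS_PER_HEAPBLOCK = 2,
 * each map page covers 8168 * 4 = 32672 heap pages.  For heap block 40000:
 *
 *		HEAPBLK_TO_MAPBLOCK(40000) = 40000 / 32672       = 1
 *		HEAPBLK_TO_MAPBYTE(40000)  = (40000 % 32672) / 4 = 1832
 *		HEAPBLK_TO_OFFSET(40000)   = (40000 % 4) * 2     = 0
 */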
/* Masks for counting subsets of bits in the visibility map. */
#define VISIBLE_MASK8	(0x55)	/* The lower bit of each bit pair */
#define FROZEN_MASK8	(0xaa)	/* The upper bit of each bit pair */
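
/*
 * Worked example (illustrative): a map byte of 0xcd (binary 1100 1101)
 * encodes, from the lowest bit pair up, one all-visible page, one
 * all-visible-and-frozen page, one page with neither bit, and another
 * all-visible-and-frozen page.  Masking and popcounting yields the tallies:
 *
 *		0xcd & VISIBLE_MASK8 = 0x45  ->  popcount 3  ->  3 pages all-visible
 *		0xcd & FROZEN_MASK8  = 0x88  ->  popcount 2  ->  2 pages all-frozen
 *
 * which is the per-byte arithmetic behind the pg_popcount_masked() calls
 * used to count whole map pages.
 */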
/* prototypes for internal routines */
/*
 * visibilitymap_clear - clear specified bits for one page in visibility map
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O. Returns true if any bits have been cleared and false otherwise.
 */
	uint8		mask = flags << mapOffset;
	bool		cleared = false;

	/* Must never clear all_visible bit while leaving all_frozen bit set */

#ifdef TRACE_VISIBILITYMAP

	elog(ERROR, "wrong buffer passed to visibilitymap_clear");

	if (map[mapByte] & mask)
		map[mapByte] &= ~mask;
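
	/*
	 * Worked example (illustrative, not from the original file): clearing
	 * only the all-frozen bit of the third heap page covered by this byte,
	 * i.e. flags = VISIBILITYMAP_ALL_FROZEN (0x02) and mapOffset = 4, gives
	 *
	 *		mask = 0x02 << 4 = 0x20
	 *		map[mapByte] &= ~0x20		(the other pages' bits are untouched)
	 *
	 * Clearing just the all-frozen bit is legitimate; clearing only the
	 * all-visible bit while leaving all-frozen set is what the assertion
	 * above guards against.
	 */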
/*
 * visibilitymap_pin - pin a map page for setting a bit
 *
 * Setting a bit in the visibility map is a two-phase operation. First, call
 * visibilitymap_pin, to pin the visibility map page containing the bit for
 * the heap page. Because that can require I/O to read the map page, you
 * shouldn't hold a lock on the heap page while doing that. Then, call
 * visibilitymap_set to actually set the bit.
 *
 * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *vmbuf is a valid buffer with the map page containing
 * the bit for heapBlk.
 *
 * If the page doesn't exist in the map file yet, it is extended.
 */
	/* Reuse the old pinned buffer if possible */
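
/*
 * Hedged usage sketch (not part of the original file): the two-phase
 * pin-then-set protocol as a vacuum-like caller might follow it.  The
 * function name, variable names, and cutoff_xid choice are assumptions.
 */
#ifdef NOT_USED
static void
example_set_all_visible(Relation rel, Buffer heapbuf, BlockNumber blk,
						TransactionId cutoff_xid)
{
	Buffer		vmbuf = InvalidBuffer;

	/* Phase 1: pin the map page; this may do I/O, so no heap page lock yet */
	visibilitymap_pin(rel, blk, &vmbuf);

	/* Phase 2: set the bit while holding the heap page lock */
	LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);

	/* visibilitymap_set expects PD_ALL_VISIBLE to be set by the caller */
	PageSetAllVisible(BufferGetPage(heapbuf));
	MarkBufferDirty(heapbuf);

	visibilitymap_set(rel, blk, heapbuf, InvalidXLogRecPtr, vmbuf,
					  cutoff_xid, VISIBILITYMAP_ALL_VISIBLE);

	LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(vmbuf);
}
#endif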
/*
 * visibilitymap_pin_ok - do we already have the correct page pinned?
 *
 * On entry, vmbuf should be InvalidBuffer or a valid buffer returned by
 * an earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. The return value indicates whether the buffer covers the
 * given heapBlk.
 */
/*
 * visibilitymap_set - set bit(s) on a previously pinned page
 *
 * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
 * or InvalidXLogRecPtr in normal running. The VM page LSN is advanced to the
 * one provided; in normal running, we generate a new XLOG record and set the
 * page LSN to that value (though the heap page's LSN may *not* be updated;
 * see below). cutoff_xid is the largest xmin on the page being marked
 * all-visible; it is needed for Hot Standby, and can be InvalidTransactionId
 * if the page contains no tuples. It can also be set to InvalidTransactionId
 * when a page that is already all-visible is being marked all-frozen.
 *
 * Caller is expected to set the heap page's PD_ALL_VISIBLE bit before calling
 * this function. Except in recovery, caller should also pass the heap
 * buffer. When checksums are enabled and we're not in recovery, we must add
 * the heap buffer to the WAL chain to protect it from being torn.
 *
 * You must pass a buffer containing the correct map page to this function.
 * Call visibilitymap_pin first to pin the right one. This function doesn't do
 * any I/O.
 *
 * Returns the state of the page's VM bits before setting flags.
 */
#ifdef TRACE_VISIBILITYMAP

	/* Must never set all_frozen bit without also setting all_visible bit */

	/* Check that we have the right heap page pinned, if present */
	elog(ERROR, "wrong heap buffer passed to visibilitymap_set");

	/* Check that we have the right VM page pinned */
	elog(ERROR, "wrong VM buffer passed to visibilitymap_set");

	map[mapByte] |= (flags << mapOffset);
	/*
	 * If data checksums are enabled (or wal_log_hints=on), we
	 * need to protect the heap page from being torn.
	 *
	 * If not, then we must *not* update the heap page's LSN. In
	 * this case, the FPI for the heap page was omitted from the
	 * WAL record inserted above, so it would be incorrect to
	 * update the heap page's LSN.
	 */
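
	/*
	 * Hedged sketch of the surrounding WAL/LSN handling (abbreviated; this is
	 * how the ordering rule from the file header plays out in normal running,
	 * not a verbatim copy of the elided code):
	 *
	 *		if (RelationNeedsWAL(rel))
	 *		{
	 *			if (XLogRecPtrIsInvalid(recptr))
	 *			{
	 *				recptr = log_heap_visible(rel, heapBuf, vmBuf,
	 *										  cutoff_xid, flags);
	 *				if (XLogHintBitIsNeeded())
	 *					PageSetLSN(BufferGetPage(heapBuf), recptr);
	 *			}
	 *			PageSetLSN(BufferGetPage(vmBuf), recptr);
	 *		}
	 */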
/*
 * visibilitymap_get_status - get status of bits
 *
 * Are all tuples on heapBlk visible to all transactions, or all frozen,
 * according to the visibility map?
 *
 * On entry, *vmbuf should be InvalidBuffer or a valid buffer returned by an
 * earlier call to visibilitymap_pin or visibilitymap_get_status on the same
 * relation. On return, *vmbuf is a valid buffer with the map page containing
 * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
 * releasing *vmbuf after it's done testing and setting bits.
 *
 * NOTE: This function is typically called without a lock on the heap page,
 * so somebody else could change the bit just after we look at it. In fact,
 * since we don't lock the visibility map page either, it's even possible that
 * someone else could have changed the bit just before we look at it, but yet
 * we might see the old value. It is the caller's responsibility to deal with
 * all concurrency issues!
 */
#ifdef TRACE_VISIBILITYMAP

	/* Reuse the old pinned buffer if possible */

	/*
	 * A single byte read is atomic. There could be memory-ordering effects
	 * here, but for performance reasons we make it the caller's job to worry
	 * about that.
	 */
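
/*
 * Hedged usage sketch (not part of the original file): roughly how an
 * index-only-scan style caller consults the map.  The function name and
 * variable names are assumptions.
 */
#ifdef NOT_USED
static bool
example_page_is_all_visible(Relation rel, BlockNumber heapBlk)
{
	Buffer		vmbuf = InvalidBuffer;
	uint8		status;

	/* No heap page lock is needed; the result may be immediately stale */
	status = visibilitymap_get_status(rel, heapBlk, &vmbuf);

	if (BufferIsValid(vmbuf))
		ReleaseBuffer(vmbuf);

	return (status & VISIBILITYMAP_ALL_VISIBLE) != 0;
}
#endif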
/*
 * visibilitymap_count - count number of bits set in visibility map
 *
 * Note: we ignore the possibility of race conditions when the table is being
 * extended concurrently with the call. New pages added to the table aren't
 * going to be marked all-visible or all-frozen, so they won't affect the
 * result.
 */
	/* all_visible must be specified */

	for (mapBlock = 0;; mapBlock++)
	{
		/*
		 * Read till we fall off the end of the map. We assume that any extra
		 * bytes in the last page are zeroed, so we don't bother excluding
		 * them from the count.
		 */

		/*
		 * We choose not to lock the page, since the result is going to be
		 * immediately stale anyway if anyone is concurrently setting or
		 * clearing bits, and we only really need an approximate value.
		 */

	*all_visible = nvisible;
	*all_frozen = nfrozen;
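
/*
 * Hedged usage sketch (illustrative): callers that maintain per-relation
 * statistics typically just want the two totals.  The function name and the
 * DEBUG message are assumptions.
 */
#ifdef NOT_USED
static void
example_report_vm_counts(Relation rel)
{
	BlockNumber nvisible;
	BlockNumber nfrozen;

	visibilitymap_count(rel, &nvisible, &nfrozen);
	elog(DEBUG1, "\"%s\": %u all-visible, %u all-frozen pages in the VM",
		 RelationGetRelationName(rel), nvisible, nfrozen);
}
#endif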
/*
 * visibilitymap_prepare_truncate -
 *		prepare for truncation of the visibility map
 *
 * nheapblocks is the new size of the heap.
 *
 * Return the number of blocks of new visibility map.
 * If it's InvalidBlockNumber, there is nothing to truncate;
 * otherwise the caller is responsible for calling smgrtruncate()
 * to truncate the visibility map pages.
 */
	/* last remaining block, byte, and bit */

#ifdef TRACE_VISIBILITYMAP

	/*
	 * If no visibility map has been created yet for this relation, there's
	 * nothing to truncate.
	 */

	/*
	 * Unless the new size is exactly at a visibility map page boundary, the
	 * tail bits in the last remaining map page, representing truncated heap
	 * blocks, need to be cleared. This is not only tidy, but also necessary
	 * because we don't get a chance to clear the bits if the heap is extended
	 * again.
	 */
	if (truncByte != 0 || truncOffset != 0)
	{
		newnblocks = truncBlock + 1;

		mapBuffer = vm_readbuf(rel, truncBlock, false);
		/* nothing to do, the file was already smaller */

		/* NO EREPORT(ERROR) from here till changes are logged */

		/* Clear out the unwanted bytes. */

		/*
		 * Mask out the unwanted bits of the last remaining byte.
		 *
		 * ((1 << 0) - 1) = 00000000
		 * ((1 << 1) - 1) = 00000001
		 * ...
		 * ((1 << 6) - 1) = 00111111
		 * ((1 << 7) - 1) = 01111111
		 */
		map[truncByte] &= (1 << truncOffset) - 1;
		/*
		 * Truncation of a relation is WAL-logged at a higher level, and we
		 * will be called at WAL replay. But if checksums are enabled, we need
		 * to still write a WAL record to protect against a torn page, if the
		 * page is flushed to disk before the truncation WAL record. We cannot
		 * use MarkBufferDirtyHint here, because that will not dirty the page
		 * during recovery.
		 */
		newnblocks = truncBlock;

	/* nothing to do, the file was already smaller than requested size */
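
/*
 * Worked example (assuming the default 8 kB block size, so 32672 heap blocks
 * per map page): truncating the heap to nheapblocks = 1000 gives
 *
 *		truncBlock  = HEAPBLK_TO_MAPBLOCK(1000) = 0
 *		truncByte   = HEAPBLK_TO_MAPBYTE(1000)  = 250
 *		truncOffset = HEAPBLK_TO_OFFSET(1000)   = 0
 *
 * so map block 0 is kept (newnblocks = 1), the bytes after byte 250 are
 * cleared outright in the "Clear out the unwanted bytes" step, and byte 250
 * itself is masked with (1 << 0) - 1 = 0, i.e. fully cleared, since the heap
 * blocks it covers (1000..1003) are all beyond the new end of the heap.
 */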
/*
 * Read a visibility map page.
 *
 * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is
 * true, the visibility map file is extended.
 */

	/*
	 * Caution: re-using this smgr pointer could fail if the relcache entry
	 * gets closed. It's safe as long as we only do smgr-level operations
	 * between here and the last use of the pointer.
	 */

	/*
	 * If we haven't cached the size of the visibility map fork yet, check it
	 * first.
	 */
	/*
	 * For reading we use ZERO_ON_ERROR mode, and initialize the page if
	 * necessary. It's always safe to clear bits, so it's better to clear
	 * corrupt pages than error out.
	 *
	 * We use the same path below to initialize pages when extending the
	 * relation, as a concurrent extension can end up with vm_extend()
	 * returning an already-initialized page.
	 */

	/*
	 * Initializing the page when needed is trickier than it looks, because of
	 * the possibility of multiple backends doing this concurrently, and our
	 * desire to not uselessly take the buffer lock in the normal path where
	 * the page is OK. We must take the lock to initialize the page, so
	 * recheck page newness after we have the lock, in case someone else
	 * already did it. Also, because we initially check PageIsNew with no
	 * lock, it's possible to fall through and return the buffer while someone
	 * else is still initializing the page (i.e., we might see pd_upper as set
	 * but other page header fields are still zeroes). This is harmless for
	 * callers that will take a buffer lock themselves, but some callers
	 * inspect the page without any lock at all. The latter is OK only so
	 * long as it doesn't depend on the page header having correct contents.
	 * Current usage is safe because PageGetContents() does not require that.
	 */
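
/*
 * Hedged sketch of the read-and-maybe-initialize pattern described above
 * (abbreviated; the real function also handles the cached-size check and the
 * extension path, and the function name here is an assumption).
 */
#ifdef NOT_USED
static Buffer
example_vm_readbuf(Relation rel, BlockNumber blkno)
{
	Buffer		buf;

	/* Zero (rather than error) on a corrupt page: clearing bits is safe */
	buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
							 RBM_ZERO_ON_ERROR, NULL);

	if (PageIsNew(BufferGetPage(buf)))
	{
		/* Take the lock and recheck: someone may have initialized it */
		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		if (PageIsNew(BufferGetPage(buf)))
			PageInit(BufferGetPage(buf), BLCKSZ, 0);
		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	}
	return buf;
}
#endif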
/*
 * Ensure that the visibility map fork is at least vm_nblocks long, extending
 * it if necessary with zeroed pages.
 */

	/*
	 * Send a shared-inval message to force other backends to close any smgr
	 * references they may have for this rel, which we are about to change.
	 * This is a useful optimization because it means that backends don't have
	 * to keep checking for creation or extension of the file, which happens
	 * infrequently.
	 */
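
/*
 * Hedged sketch of the extension step described above (abbreviated; the
 * function name is an assumption, and the exact ExtendBufferedRelTo flag
 * combination shown here is a guess rather than a statement about the real
 * vm_extend()).
 */
#ifdef NOT_USED
static Buffer
example_vm_extend(Relation rel, BlockNumber vm_nblocks)
{
	Buffer		buf;

	/* Extend the VM fork to vm_nblocks, creating it if it doesn't exist */
	buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
							  EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
							  vm_nblocks, RBM_ZERO_ON_ERROR);

	/* Tell other backends to drop their smgr references to this relation */
	CacheInvalidateSmgr(RelationGetSmgr(rel)->smgr_rlocator);

	return buf;
}
#endif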