/*-------------------------------------------------------------------------
 *
 * heapam_xlog.c
 *	  WAL replay logic for heap access method.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/heapam_xlog.c
 *
 *-------------------------------------------------------------------------
 */
 * Replay XLOG_HEAP2_PRUNE_* records.
 * We will take an ordinary exclusive lock or a cleanup lock depending on
 * whether the XLHP_CLEANUP_LOCK flag is set. With an ordinary exclusive
 * lock, we had better not be doing anything that requires moving existing
 * line pointers.
 * We are about to remove and/or freeze tuples. In Hot Standby mode,
 * ensure that there are no queries running for which the removed tuples
 * are still visible or which still consider the frozen xids as running.
 * The conflict horizon XID comes after xl_heap_prune.
/* memcpy() because snapshot_conflict_horizon is stored unaligned */
memcpy(&snapshot_conflict_horizon, maindataptr, sizeof(TransactionId));
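/*
 * Illustrative sketch, not verbatim from this file: once the conflict
 * horizon has been read, a Hot Standby typically resolves conflicts
 * against it roughly like this. The names xlrec and rlocator are assumed
 * for the decoded record header and the block's relation locator.
 */
if (InHotStandby)
{
	RelFileLocator rlocator;

	XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
	ResolveRecoveryConflictWithSnapshot(snapshot_conflict_horizon,
										(xlrec.flags & XLHP_IS_CATALOG_REL) != 0,
										rlocator);
}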
 * If we have a full-page image, restore it and we're done.
&nplans, &plans, &frz_offsets,
&nredirected, &redirected,
&ndead, &nowdead,
&nunused, &nowunused);
 * Update all line pointers per the record, and repair fragmentation
 * if needed.
if (nredirected > 0 || ndead > 0 || nunused > 0)
redirected, nredirected,
 * Convert freeze plan representation from WAL record into
 * per-tuple format used by heap_execute_freeze_tuple
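/*
 * Hedged sketch of the conversion this comment describes; field and
 * variable names (plans, frz_offsets, page) are assumed, not copied
 * verbatim from this file.
 */
HeapTupleFreeze frz;

frz.xmax = plans[p].xmax;
frz.t_infomask2 = plans[p].t_infomask2;
frz.t_infomask = plans[p].t_infomask;
frz.frzflags = plans[p].frzflags;

/* apply the same plan to each offset recorded for it */
for (int i = 0; i < plans[p].ntuples; i++)
{
	OffsetNumber off = *(frz_offsets++);
	ItemId		lp = PageGetItemId(page, off);
	HeapTupleHeader tuple = (HeapTupleHeader) PageGetItem(page, lp);

	heap_execute_freeze_tuple(tuple, &frz);
}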
/* There should be no more data */
Assert((char *) frz_offsets == dataptr + datalen);
 * Note: we don't worry about updating the page's prunability hints.
 * At worst this will cause an extra prune cycle to occur soon.
 * If we released any space or line pointers, update the free space map.
 * Do this regardless of a full-page image being applied, since the FSM
 * data is not in the page anyway.
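/*
 * Hedged sketch of that FSM update; variable names (buffer, rlocator,
 * blkno) are assumed. The buffer lock is dropped before touching the FSM,
 * which is not WAL-logged.
 */
if (BufferIsValid(buffer))
{
	Size		freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
	RelFileLocator rlocator;
	BlockNumber blkno;

	XLogRecGetBlockTag(record, 0, &rlocator, NULL, &blkno);
	UnlockReleaseBuffer(buffer);
	XLogRecordPageWithFreeSpace(rlocator, blkno, freespace);
}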
 * Replay XLOG_HEAP2_VISIBLE records.
 * The critical integrity requirement here is that we must never end up with
 * a situation where the visibility map bit is set, and the page-level
 * PD_ALL_VISIBLE bit is clear. If that were to occur, then a subsequent
 * page modification would fail to clear the visibility map bit.
 * If there are any Hot Standby transactions running that have an xmin
 * horizon old enough that this page isn't all-visible for them, they
 * might incorrectly decide that an index-only scan can skip a heap fetch.
 * NB: It might be better to throw some kind of "soft" conflict here that
 * forces any index-only scan that is in flight to perform heap fetches,
 * rather than killing the transaction outright.
 * Read the heap page, if it still exists. If the heap file has been dropped
 * or truncated later in recovery, we don't need to update the page, but we'd
 * better still update the visibility map.
 * We don't bump the LSN of the heap page when setting the visibility
 * map bit (unless checksums or wal_log_hints is enabled, in which
 * case we must). This exposes us to torn page hazards, but since
 * we're not inspecting the existing page contents in any way, we
 * don't care.
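/*
 * Hedged sketch of the conditional LSN bump described above (variable
 * names page, buffer, and lsn are assumed):
 */
PageSetAllVisible(page);

if (XLogHintBitIsNeeded())
	PageSetLSN(page, lsn);

MarkBufferDirty(buffer);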
 * If the heap block was backed up, we already restored it and there's
 * nothing more to do. (This can only happen with checksums or
 * wal_log_hints enabled.)
 * Since FSM is not WAL-logged and only updated heuristically, it
 * easily becomes stale in standbys. If the standby is later promoted
 * and runs VACUUM, it will skip updating individual free space
 * figures for pages that became all-visible (or all-frozen, depending
 * on the vacuum mode), which is troublesome when FreeSpaceMapVacuum
 * propagates too-optimistic free space values to upper FSM layers;
 * later inserters try to use such pages only to find out that they
 * are unusable. This can cause long stalls when there are many such
 * pages.
 * Forestall those problems by updating FSM's idea about a page that
 * is becoming all-visible or all-frozen.
 * Do this regardless of a full-page image being applied, since the
 * FSM data is not in the page anyway.
 * Even if we skipped the heap page update due to the LSN interlock, it's
 * still safe to update the visibility map. Any WAL record that clears
 * the visibility map bit does so before checking the page LSN, so any
 * bits that need to be cleared will still be cleared.
/* initialize the page if it was read as zeros */
/* remove VISIBILITYMAP_XLOG_* */
 * XLogReadBufferForRedoExtended locked the buffer. But
 * visibilitymap_set will handle locking itself.
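/*
 * Hedged sketch, not verbatim: unlock the VM buffer and let
 * visibilitymap_set do its own locking, using a fake relcache entry
 * during recovery. Variable names (vmbuffer, rlocator, blkno, lsn) are
 * assumed.
 */
Relation	reln;

LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);

reln = CreateFakeRelcacheEntry(rlocator);
visibilitymap_pin(reln, blkno, &vmbuffer);
visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
				  xlrec->snapshotConflictHorizon,
				  xlrec->flags & VISIBILITYMAP_VALID_BITS);
FreeFakeRelcacheEntry(reln);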
 * Given an "infobits" field from an XLog record, set the correct bits in the
 * given infomask and infomask2 for the tuple touched by the record.
 * (This is the reverse of compute_infobits).
*infomask2 &= ~HEAP_KEYS_UPDATED;
/* note HEAP_XMAX_SHR_LOCK isn't considered here */
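/*
 * Hedged sketch of the XLHL_* -> infomask mapping the fragments above
 * belong to (assumed to mirror compute_infobits in reverse):
 */
if (infobits & XLHL_XMAX_IS_MULTI)
	*infomask |= HEAP_XMAX_IS_MULTI;
if (infobits & XLHL_XMAX_LOCK_ONLY)
	*infomask |= HEAP_XMAX_LOCK_ONLY;
if (infobits & XLHL_XMAX_EXCL_LOCK)
	*infomask |= HEAP_XMAX_EXCL_LOCK;
if (infobits & XLHL_XMAX_KEYSHR_LOCK)
	*infomask |= HEAP_XMAX_KEYSHR_LOCK;
if (infobits & XLHL_KEYS_UPDATED)
	*infomask2 |= HEAP_KEYS_UPDATED;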
 * Replay XLOG_HEAP_DELETE records.
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
/* Mark the page as a candidate for pruning */
/* Make sure t_ctid is set correctly */
htup->t_ctid = target_tid;
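/*
 * Hedged sketch of the branch the assignment above belongs to: a delete
 * performed as part of a cross-partition UPDATE stores a "moved
 * partitions" marker instead of a real ctid. Variable names are assumed.
 */
if (xlrec->flags & XLH_DELETE_IS_PARTITION_MOVE)
	HeapTupleHeaderSetMovedPartitions(htup);
else
	htup->t_ctid = target_tid;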
 * Replay XLOG_HEAP_INSERT records.
/* No freezing in the heap_insert() code path */
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
 * If we inserted the first and only tuple on the page, re-initialize the
 * page from scratch.
elog(PANIC, "invalid max offset number");
/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
htup->t_ctid = target_tid;
 * If the page is running low on free space, update the FSM as well.
 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
 * better than that without knowing the fill-factor for the table.
 * XXX: Don't do this if the page was restored from a full-page image. We
 * don't bother to update the FSM in that case; it doesn't need to be
 * totally accurate anyway.
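/*
 * Hedged sketch of that check (variable names action, freespace, blkno,
 * and target_locator are assumed):
 */
if (action == BLK_NEEDS_REDO && freespace < BLCKSZ / 5)
	XLogRecordPageWithFreeSpace(target_locator, blkno, freespace);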
 * Replay XLOG_HEAP2_MULTI_INSERT records.
 * Insertion doesn't overwrite MVCC data, so no conflict processing is
 * required.
/* check that the mutually exclusive flags are not both set */
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
/* Tuples are stored as block data */
endptr = tupdata + len;
 * If we're reinitializing the page, the tuples are stored in
 * order from FirstOffsetNumber. Otherwise there's an array of
 * offsets in the WAL record, and the tuples come after that.
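/*
 * Hedged sketch of the offset selection described above (variable names
 * isinit, offnum, and i are assumed):
 */
if (isinit)
	offnum = FirstOffsetNumber + i;
else
	offnum = xlrec->offsets[i];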
elog(PANIC, "invalid max offset number");
/* PG73FORMAT: get bitmap [+ padding] [+ oid] + data */
if (tupdata != endptr)
	elog(PANIC, "total tuple length mismatch");
/* XLH_INSERT_ALL_FROZEN_SET implies that all tuples are visible */
 * If the page is running low on free space, update the FSM as well.
 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
 * better than that without knowing the fill-factor for the table.
 * XXX: Don't do this if the page was restored from a full-page image. We
 * don't bother to update the FSM in that case; it doesn't need to be
 * totally accurate anyway.
 * Replay XLOG_HEAP_UPDATE and XLOG_HEAP_HOT_UPDATE records.
/* initialize to keep the compiler quiet */
/* HOT updates are never done across pages */
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
 * In normal operation, it is important to lock the two pages in
 * page-number order, to avoid possible deadlocks against other update
 * operations going the other way. However, during WAL replay there can
 * be no other update happening, so we don't need to worry about that. But
 * we *do* need to worry that we don't expose an inconsistent state to Hot
 * Standby queries --- so the original page can't be unlocked before we've
 * added the new tuple to the new page.
/* Deal with old tuple version */
/* Set forward chain link in t_ctid */
/* Mark the page as a candidate for pruning */
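/*
 * Hedged one-line sketch of how the page gets marked prunable (assumed to
 * use the XID carried by the WAL record):
 */
PageSetPrunable(page, XLogRecGetXid(record));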
 * Read the page the new tuple goes into, if different from old.
if (oldblk == newblk)
newaction = oldaction;
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
/* Deal with new tuple */
recdata_end = recdata + datalen;
elog(PANIC, "invalid max offset number");
memcpy(&prefixlen, recdata, sizeof(uint16));
recdata += sizeof(uint16);
memcpy(&suffixlen, recdata, sizeof(uint16));
recdata += sizeof(uint16);
tuplen = recdata_end - recdata;
 * Reconstruct the new tuple using the prefix and/or suffix from the
 * old tuple, and the data stored in the WAL record.
/* copy bitmap [+ padding] [+ oid] from WAL record */
memcpy(newp, recdata, len);
/* copy prefix from old tuple */
/* copy new tuple data from WAL record */
memcpy(newp, recdata, len);
 * copy bitmap [+ padding] [+ oid] + data from record, all in one go
memcpy(newp, recdata, tuplen);
Assert(recdata == recdata_end);
/* copy suffix from old tuple */
memcpy(newp, (char *) oldtup.t_data + oldtup.t_len - suffixlen, suffixlen);
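/*
 * Hedged sketch (newlen is an assumed variable name): the reconstructed
 * tuple's total length is the fixed header plus the WAL-stored payload
 * plus whatever was reused from the old tuple.
 */
newlen = SizeofHeapTupleHeader + tuplen + prefixlen + suffixlen;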
/* Make sure there is no forward chain link in t_ctid */
 * If the new page is running low on free space, update the FSM as well.
 * Arbitrarily, our definition of "low" is less than 20%. We can't do much
 * better than that without knowing the fill-factor for the table.
 * However, don't update the FSM on HOT updates, because after crash
 * recovery, either the old or the new tuple will certainly be dead and
 * prunable. After pruning, the page will have roughly as much free space
 * as it did before the update, assuming the new tuple is about the same
 * size as the old one.
 * XXX: Don't do this if the page was restored from a full-page image. We
 * don't bother to update the FSM in that case; it doesn't need to be
 * totally accurate anyway.
if (newaction == BLK_NEEDS_REDO && !hot_update && freespace < BLCKSZ / 5)
 * Replay XLOG_HEAP_CONFIRM records.
 * Confirm tuple as actually inserted
 * Replay XLOG_HEAP_LOCK records.
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
 * Clear relevant update flags, but only if the modified infomask says
 * there's no update.
/* Make sure there is no forward chain link in t_ctid */
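/*
 * Hedged sketch (variable names buffer and offnum are assumed): point
 * t_ctid back at the tuple itself so there is no forward chain link.
 */
ItemPointerSet(&htup->t_ctid, BufferGetBlockNumber(buffer), offnum);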
 * Replay XLOG_HEAP2_LOCK_UPDATED records.
 * The visibility map may need to be fixed even if the heap page is
 * already up-to-date.
 * Replay XLOG_HEAP_INPLACE records.
if (oldlen != newlen)
	elog(PANIC, "wrong tuple length");
memcpy((char *) htup + htup->t_hoff, newtup, newlen);
 * These operations don't overwrite MVCC data, so no conflict processing is
 * required. The ones in the heap2 rmgr do.
 * TRUNCATE is a no-op because the actions are already logged as
 * SMGR WAL records. The TRUNCATE WAL record only exists for logical
 * decoding.
elog(PANIC, "heap_redo: unknown op code %u", info);
 * Nothing to do on a real replay, only used during logical decoding.
elog(PANIC, "heap2_redo: unknown op code %u", info);
 * Mask a heap page before performing consistency checks on it.
 * If xmin of a tuple is not yet frozen, we should ignore
 * differences in hint bits, since they can be set without
 * emitting WAL.
/* Still, we need to mask xmax hint bits. */
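/*
 * Hedged sketch of that masking (HEAP_XACT_MASK and the variable name
 * page_htup are assumed from htup_details.h and the surrounding loop):
 */
if (!HeapTupleHeaderXminFrozen(page_htup))
	page_htup->t_infomask &= ~HEAP_XACT_MASK;
else
{
	/* still mask the xmax hint bits */
	page_htup->t_infomask &= ~HEAP_XMAX_INVALID;
	page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED;
}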
 * During replay, we set Command Id to FirstCommandId. Hence, mask
 * it. See heap_xlog_insert() for details.
page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER;
 * For a speculative tuple, heap_insert() does not set ctid in the
 * caller-passed heap tuple itself, leaving the ctid field to
 * contain a speculative token value - a per-backend monotonically
 * increasing identifier. Besides, it does not WAL-log ctid under
 * any circumstances.
 * During redo, heap_xlog_insert() sets t_ctid to current block
 * number and self offset number. It doesn't care about any
 * speculative insertions on the primary. Hence, we set t_ctid to
 * current block number and self offset number to ignore any
 * inconsistency.
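/*
 * Hedged sketch of the normalization described above (variable names
 * blkno and off are assumed):
 */
if (HeapTupleHeaderIsSpeculative(page_htup))
	ItemPointerSet(&page_htup->t_ctid, blkno, off);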
 * NB: Not ignoring ctid changes due to the tuple having moved
 * (i.e. HeapTupleHeaderIndicatesMovedPartitions), because that's
 * important information that needs to be in-sync between primary
 * and standby, and thus is WAL logged.
 * Ignore any padding bytes after the tuple, when the length of the
 * item is not MAXALIGNed.
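/*
 * Hedged sketch of the padding mask (variable names iid and page_item are
 * assumed; MASK_MARKER comes from bufmask.h):
 */
if (ItemIdHasStorage(iid))
{
	int			len = ItemIdGetLength(iid);
	int			padlen = MAXALIGN(len) - len;

	if (padlen > 0)
		memset(page_item + len, MASK_MARKER, padlen);
}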