/*-------------------------------------------------------------------------
 *
 * nbtxlog.c
 *	  WAL replay logic for btrees.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/nbtree/nbtxlog.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * _bt_restore_page -- re-enter all the index tuples on a page
 *
 * The page is freshly init'd, and *from (length len) is a copy of what
 * had been its upper part (pd_upper to pd_special).  We assume that the
 * tuples had been added to the page in item-number order, and therefore
 * the one with highest item number appears first (lowest on the page).
 */
	char	   *end = from + len;
	/*
	 * To get the items back in the original order, we add them to the page
	 * in reverse.  To figure out where one tuple ends and another begins,
	 * we have to scan them in forward order first.
	 */
	/*
	 * As we step through the items, 'from' won't always be properly
	 * aligned, so we need to use memcpy().  Further, we use Item (which
	 * is just a char*) here for our items array for the same reason;
	 * wouldn't want the compiler or anyone thinking that an item is
	 * aligned when it isn't.
	 */
		itemsizes[i] = itemsz;
			elog(PANIC, "_bt_restore_page: cannot add item to page");
	/* Cannot log BTREE_MIN_VERSION index metapage without upgrade */
	/*
	 * Set pd_lower just past the end of the metadata.  This is essential,
	 * because without doing so, metadata will be lost if xlog.c compresses
	 * the page.
	 */
/*
 * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
 *
 * This is a common subroutine of the redo functions of all the WAL record
 * types that can insert a downlink: insert, split, and newroot.
 */
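/*
 * Minimal sketch of the standard REDO idiom this subroutine follows
 * (assuming the usual record/buffer helpers; the "_sketch" name marks this
 * as an illustration rather than the function body itself):
 */
static void
_bt_clear_incomplete_split_sketch(XLogReaderState *record, uint8 block_id)
{
	XLogRecPtr	lsn = record->EndRecPtr;
	Buffer		buf;

	if (XLogReadBufferForRedo(record, block_id, &buf) == BLK_NEEDS_REDO)
	{
		Page		page = (Page) BufferGetPage(buf);
		BTPageOpaque pageop = BTPageGetOpaque(page);

		Assert(P_INCOMPLETE_SPLIT(pageop));
		pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;

		PageSetLSN(page, lsn);
		MarkBufferDirty(buf);
	}
	if (BufferIsValid(buf))
		UnlockReleaseBuffer(buf);
}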
	/*
	 * Insertion to an internal page finishes an incomplete split at the
	 * child level.  Clear the incomplete-split flag in the child.  Note:
	 * during normal operation, the child and parent pages are locked at the
	 * same time (the locks are coupled), so that clearing the flag and
	 * inserting the downlink appear atomic to other backends.  We don't
	 * bother with that during replay, because readers don't care about the
	 * incomplete-split flag and there cannot be updates happening.
	 */
		/* Simple retail insertion */
		/*
		 * A posting list split occurred during leaf page insertion.  WAL
		 * record data will start with an offset number representing the
		 * point in an existing posting list that a split occurs at.
		 *
		 * Use _bt_swap_posting() to repeat posting list split steps from
		 * primary.  Note that newitem from WAL record is 'orignewitem',
		 * not the final version of newitem that is actually inserted on
		 * page.
		 */
		postingoff = *((uint16 *) datapos);
		datapos += sizeof(uint16);
		datalen -= sizeof(uint16);
		/* Use mutable, aligned newitem copy in _bt_swap_posting() */
		Assert(isleaf && postingoff > 0);
		/* Replace existing posting list with post-split version */

		/* Insert "final" new item (not orignewitem from WAL stream) */
			elog(PANIC, "failed to add posting split new item");
	/*
	 * Note: in normal operation, we'd update the metapage while still
	 * holding lock on the page we inserted into.  But during replay it's
	 * not necessary to hold that lock, since no other index updates can be
	 * happening concurrently, and readers will cope fine with following an
	 * obsolete link from the metapage.
	 */
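	/*
	 * Sketch of the corresponding replay step; block 2 is assumed to carry
	 * the metapage for the insert-with-metapage record variant:
	 */
	if (ismeta)
		_bt_restore_meta(record, 2);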
	bool		isleaf = (xlrec->level == 0);
	/*
	 * Clear the incomplete split flag on the appropriate child page one
	 * level down when origpage/buf is an internal page (there must have been
	 * cascading page splits during original execution in the event of an
	 * internal page split).  This is like the corresponding btree_xlog_insert
	 * call for internal pages.  We're not clearing the incomplete split flag
	 * for the current page split here (you can think of this as part of the
	 * insert of newitem that the page split action needs to perform in
	 * passing).
	 *
	 * Like in btree_xlog_insert, this can be done before locking other
	 * pages.  We never need to couple cross-level locks in REDO routines.
	 */
	/* Reconstruct right (new) sibling page from scratch */

	/* Now reconstruct original page (left half of split) */
	/*
	 * To retain the same physical order of the tuples that they had, we
	 * initialize a temporary empty page for the left page and add all the
	 * items to that in item number order.  This mirrors how _bt_split()
	 * works.  Retaining the same physical order makes WAL consistency
	 * checking possible.  See also _bt_restore_page(), which does the same
	 * for the right page.
	 */
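	/*
	 * Skeleton of that temp-page rebuild, assuming 'buf' holds the original
	 * page and that 'left_hikey'/'left_hikeysz' and 'oopaque' were already
	 * extracted; the fragments below show the full per-item handling
	 * (newitem placement and posting list replacement) that this simplified
	 * loop omits:
	 */
	{
		Page		origpage = (Page) BufferGetPage(buf);
		Page		leftpage = PageGetTempPageCopySpecial(origpage);
		OffsetNumber leftoff = P_HIKEY;
		OffsetNumber off;

		/* high key for the rebuilt left page goes in first */
		if (PageAddItem(leftpage, left_hikey, left_hikeysz, leftoff,
						false, false) == InvalidOffsetNumber)
			elog(ERROR, "failed to add high key to left page after split");
		leftoff = OffsetNumberNext(leftoff);

		/* copy the items that stay on the left side, in offset order */
		for (off = P_FIRSTDATAKEY(oopaque); off < xlrec->firstrightoff;
			 off = OffsetNumberNext(off))
		{
			ItemId		itemid = PageGetItemId(origpage, off);
			Item		item = PageGetItem(origpage, itemid);

			if (PageAddItem(leftpage, item, ItemIdGetLength(itemid), leftoff,
							false, false) == InvalidOffsetNumber)
				elog(ERROR, "failed to add old item to left page after split");
			leftoff = OffsetNumberNext(leftoff);
		}

		/* atomically install the rebuilt left page over the original */
		PageRestoreTempPage(leftpage, origpage);
	}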
		datapos += newitemsz;
		datalen -= newitemsz;
		/* Posting list must be at offset number before new item's */

		/* Use mutable, aligned newitem copy in _bt_swap_posting() */
	/*
	 * Extract left hikey and its size.  We assume that 16-bit alignment
	 * is enough to apply IndexTupleSize (since it's fetching from a
	 * uint16 field).
	 */
	datapos += left_hikeysz;
	datalen -= left_hikeysz;
		/* Add high key tuple from WAL record to temp page */
			elog(ERROR, "failed to add high key to left page after split");
			/* Add replacement posting list when required */
			if (off == replacepostingoff)
					elog(ERROR, "failed to add new posting list item to left page after split");
				continue;		/* don't insert oposting */
			/* add the new item if it was inserted on left page */
			else if (newitemonleft && off == xlrec->newitemoff)
					elog(ERROR, "failed to add new item to left page after split");
					elog(ERROR, "failed to add old item to left page after split");
		/* cope with possibility that newitem goes at the end */
		if (newitemonleft && off == xlrec->newitemoff)
				elog(ERROR, "failed to add new item to left page after split");
	/* Fix opaque fields */
	/* Fix left-link of the page to the right of the new right sibling */
	if (spagenumber != P_NONE)
	/*
	 * Finally, release the remaining buffers.  sbuf, rbuf, and buf must be
	 * released together, so that readers cannot observe inconsistencies.
	 */
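	/*
	 * Sketch of that release order, using the buffer names from the comment
	 * above (sbuf can be invalid when the split was of the rightmost page):
	 */
	if (BufferIsValid(sbuf))
		UnlockReleaseBuffer(sbuf);
	UnlockReleaseBuffer(rbuf);
	UnlockReleaseBuffer(buf);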
	state->deduplicate = true;	/* unused */
	state->nmaxitems = 0;		/* unused */
	/* Conservatively use larger maxpostingsize than primary */
	state->basetupsize = 0;
	state->phystupsize = 0;
	state->nintervals = 0;
		elog(ERROR, "deduplication failed to add highkey");
		for (offnum = minoff; offnum <= maxoff; offnum = OffsetNumberNext(offnum))

		if (offnum == minoff)

			elog(ERROR, "deduplication failed to add heap tid to pending posting list");
	for (int i = 0; i < nupdated; i++)

		vacposting->itup = origtuple;
		/* Overwrite updated version of tuple */

			elog(PANIC, "failed to update partially dead item");
		/* advance to next xl_btree_update from array */
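	/*
	 * Consolidated sketch of one iteration of the loop those fragments come
	 * from (local names and the exact pfree/advance bookkeeping are
	 * assumptions): build a BTVacuumPosting work item from the WAL payload,
	 * let _bt_update_posting() produce the shrunken tuple, then overwrite
	 * the original in place.
	 */
	{
		OffsetNumber offnum = updatedoffsets[i];
		ItemId		itemid = PageGetItemId(page, offnum);
		IndexTuple	origtuple = (IndexTuple) PageGetItem(page, itemid);
		BTVacuumPosting vacposting;

		vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +
							updates->ndeletedtids * sizeof(uint16));
		vacposting->updatedoffset = offnum;
		vacposting->itup = origtuple;
		vacposting->ndeletedtids = updates->ndeletedtids;
		memcpy(vacposting->deletetids,
			   (char *) updates + SizeOfBtreeUpdate,
			   updates->ndeletedtids * sizeof(uint16));

		/* Produce the updated (smaller) version of the posting list tuple */
		_bt_update_posting(vacposting);

		/* Overwrite updated version of tuple */
		if (!PageIndexTupleOverwrite(page, offnum, (Item) vacposting->itup,
									 IndexTupleSize(vacposting->itup)))
			elog(PANIC, "failed to update partially dead item");

		pfree(vacposting->itup);
		pfree(vacposting);

		/* advance to next xl_btree_update from array */
		updates = (xl_btree_update *)
			((char *) updates + SizeOfBtreeUpdate +
			 updates->ndeletedtids * sizeof(uint16));
	}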
	/*
	 * We need to take a cleanup lock here, just like btvacuumpage().
	 * However, it isn't necessary to exhaustively get a cleanup lock on
	 * every block in the index during recovery (just getting a cleanup lock
	 * on pages with items to kill suffices).  See nbtree/README for details.
	 */
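	/*
	 * Minimal sketch of how that cleanup lock is taken during replay,
	 * assuming block 0 of the vacuum record is the target leaf page
	 * ('buffer' and 'lsn' are the usual redo locals):
	 */
	if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer)
		== BLK_NEEDS_REDO)
	{
		Page		page = (Page) BufferGetPage(buffer);

		/* ... apply the deletes and posting list updates carried here ... */

		PageSetLSN(page, lsn);
		MarkBufferDirty(buffer);
	}
	if (BufferIsValid(buffer))
		UnlockReleaseBuffer(buffer);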
		/*
		 * Clear the vacuum cycle ID, and mark the page as not containing
		 * any LP_DEAD items
		 */
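		/* Sketch of those two updates, assuming opaque = BTPageGetOpaque(page) */
		opaque->btpo_cycleid = 0;
		opaque->btpo_flags &= ~BTP_HAS_GARBAGE;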
	/*
	 * If we have any conflict processing to do, it must happen before we
	 * update the page
	 */
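	/*
	 * Sketch of that conflict step, assuming hot standby; the delete record
	 * stores a plain TransactionId horizon, unlike the full-XID horizon in
	 * reuse-page records:
	 */
	if (InHotStandby)
	{
		RelFileLocator rlocator;

		XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
		ResolveRecoveryConflictWithSnapshot(xlrec->snapshotConflictHorizon,
											xlrec->isCatalogRel,
											rlocator);
	}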
	/*
	 * We don't need to take a cleanup lock to apply these changes.  See
	 * nbtree/README for details.
	 */
		/*
		 * Do *not* clear the vacuum cycle ID, but do mark the page as not
		 * containing any LP_DEAD items
		 */
	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, it should be
	 * okay to lock just one page at a time, since no concurrent index
	 * updates can be happening, and readers should not care whether they
	 * arrive at the target page or not (since it's surely empty).
	 */
	/* to-be-deleted subtree's parent page */
	/*
	 * Don't need to couple cross-level locks in REDO routines, so release
	 * lock on internal page immediately
	 */
	/* Rewrite the leaf page as a halfdead page */
	/*
	 * Construct a dummy high key item that points to top parent page (value
	 * is InvalidBlockNumber when the top parent page is the leaf page itself)
	 */
		elog(ERROR, "could not add dummy high key to half-dead page");
	level = xlrec->level;
	isleaf = (level == 0);

	/* No leaftopparent for level 0 (leaf page) or level 1 target */
	/*
	 * In normal operation, we would lock all the pages this WAL record
	 * touches before changing any of them.  In WAL replay, we at least lock
	 * the pages in the same standard left-to-right order (leftsib, target,
	 * rightsib), and don't release the sibling locks until the target is
	 * marked deleted.
	 */
	/* Fix right-link of left sibling, if any */

	/* Rewrite target page as empty deleted page */

	/* Fix left-link of right sibling */

	/* Release siblings */
	/*
	 * If we deleted a parent of the targeted leaf page, instead of the leaf
	 * itself, update the leaf to point to the next remaining child in the
	 * to-be-deleted subtree
	 */
		/*
		 * There is no real data on the page, so we just re-create it from
		 * scratch using the information from the WAL record.
		 *
		 * Note that we don't end up here when the target page is also the
		 * leafbuf page.  There is no need to add a dummy hikey item with a
		 * top parent link when deleting leafbuf because it's the last page
		 * we'll delete in the subtree undergoing deletion.
		 */
		/* Add a dummy hikey item */

			elog(ERROR, "could not add dummy high key to half-dead page");
	/* Update metapage if needed */

	if (xlrec->level == 0)

	if (xlrec->level > 0)

	/* Clear the incomplete-split flag in left child */
/*
 * In general VACUUM must defer recycling as a way of avoiding certain race
 * conditions.  Deleted pages contain a safexid value that is used by VACUUM
 * to determine whether or not it's safe to place a page that was deleted by
 * VACUUM earlier into the FSM now.  See nbtree/README.
 *
 * As far as any backend operating during original execution is concerned,
 * the FSM is a cache of recycle-safe pages; the mere presence of the page in
 * the FSM indicates that the page must already be safe to recycle (actually,
 * _bt_allocbuf() verifies it's safe using BTPageIsRecyclable(), but that's
 * just because it would be unwise to completely trust the FSM, given its
 * current limitations).
 *
 * This isn't sufficient to prevent similar concurrent recycling race
 * conditions during Hot Standby, though.  For that we need to log a
 * xl_btree_reuse_page record at the point that a page is actually recycled
 * and reused for an entirely unrelated page inside _bt_split().  These
 * records include the same safexid value from the original deleted page,
 * stored in the record's snapshotConflictHorizon field.
 *
 * The GlobalVisCheckRemovableFullXid() test in BTPageIsRecyclable() is used
 * to determine if it's safe to recycle a page.  This mirrors our own test:
 * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
 * Consequently, one XID value achieves the same exclusion effect on primary
 * and standby.
 */
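/*
 * Minimal sketch of the replay-side handling this implies; the record's
 * only job during recovery is conflict resolution (the "_sketch" name and
 * the assumption that hot standby is active are illustrative):
 */
static void
btree_xlog_reuse_page_sketch(XLogReaderState *record)
{
	xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);

	if (InHotStandby)
		ResolveRecoveryConflictWithSnapshotFullXid(xlrec->snapshotConflictHorizon,
												   xlrec->isCatalogRel,
												   xlrec->locator);
}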
			elog(PANIC, "btree_redo: unknown op code %u", info);
1080 "Btree recovery temporary context",
/*
 * Mask a btree page before performing consistency checks on it.
 */

	/*
	 * In btree leaf pages, it is possible to modify the LP_FLAGS without
	 * emitting any WAL record.  Hence, mask the line pointer flags.  See
	 * _bt_killitems(), _bt_check_unique() for details.
	 */
	/*
	 * BTP_HAS_GARBAGE is just an un-logged hint bit.  So, mask it.  See
	 * _bt_delete_or_dedup_one_page(), _bt_killitems(), and _bt_check_unique()
	 * for details.
	 */
	/*
	 * During replay of a btree page split, we don't set the BTP_SPLIT_END
	 * flag of the right sibling and initialize the cycle_id to 0 for the
	 * same page.  See btree_xlog_split() for details.
	 */
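/*
 * Hedged sketch of how those masking rules combine; it is simplified (the
 * real function also special-cases deleted pages) and the "_sketch" name
 * marks it as an illustration:
 */
static void
btree_mask_sketch(char *pagedata, BlockNumber blkno)
{
	Page		page = (Page) pagedata;
	BTPageOpaque maskopaq = BTPageGetOpaque(page);

	mask_page_lsn_and_checksum(page);
	mask_page_hint_bits(page);
	mask_unused_space(page);

	/* Line pointer flags on leaf pages can change without any WAL record */
	if (P_ISLEAF(maskopaq))
		mask_lp_flags(page);

	/* BTP_HAS_GARBAGE is an unlogged hint */
	maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE;

	/* Split-end flag and cycle id aren't set when replaying a page split */
	maskopaq->btpo_flags &= ~BTP_SPLIT_END;
	maskopaq->btpo_cycleid = 0;
}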