1/*-------------------------------------------------------------------------
4 * code to create and destroy physical storage for relations
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/catalog/storage.c
14 * Some of this code used to be in storage/smgr/smgr.c, and the
15 * function names still reflect that.
17 *-------------------------------------------------------------------------
43 * We keep a list of all relations (represented as RelFileLocator values)
44 * that have been created or deleted in the current transaction. When
45 * a relation is created, we create the physical file immediately, but
46 * remember it so that we can delete the file again if the current
47 * transaction is aborted. Conversely, a deletion request is NOT
48 * executed immediately, but is just entered in the list. When and if
49 * the transaction commits, we can delete the physical file.
51 * To handle subtransactions, every entry is marked with its transaction
52 * nesting level. At subtransaction commit, we reassign the subtransaction's
53 * entries to the parent nesting level. At subtransaction abort, we can
54 * immediately execute the abort-time actions for all entries of the current
57 * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
58 * unbetimes. It'd probably be OK to keep it in TopTransactionContext,
59 * but I'm being paranoid.
66 bool atCommit;
/* T=delete at commit; F=delete at abort */
83 * Queue an at-commit fsync.
91 /* create the hash if it does not exist yet */
109 * RelationCreateStorage
110 * Create physical storage for a relation.
112 * Create the underlying disk file storage for the relation. This only
113 * creates the main fork; additional forks are created lazily by the
114 * modules that need them.
116 * This function is transactional. The creation is WAL-logged, and if the
117 * transaction aborts later on, the storage will be destroyed. A caller
118 * that does not want the storage to be destroyed in case of an abort may
119 * pass register_delete = false.
123 bool register_delete)
131 switch (relpersistence)
133 case RELPERSISTENCE_TEMP:
137 case RELPERSISTENCE_UNLOGGED:
141 case RELPERSISTENCE_PERMANENT:
146 elog(
ERROR,
"invalid relpersistence: %c", relpersistence);
147 return NULL;
/* placate compiler */
150 srel =
smgropen(rlocator, procNumber);
157 * Add the relation to the list of stuff to delete at abort, if we are
168 pending->
atCommit =
false;
/* delete if abort */
174 if (relpersistence == RELPERSISTENCE_PERMANENT && !
XLogIsNeeded())
184 * Perform XLogInsert of an XLOG_SMGR_CREATE record to WAL.
192 * Make an XLOG entry reporting the file creation.
203 * RelationDropStorage
204 * Schedule unlinking of physical storage at transaction commit.
211 /* Add the relation to the list of stuff to delete at commit */
216 pending->
atCommit =
true;
/* delete if commit */
222 * NOTE: if the relation was created in this transaction, it will now be
223 * present in the pending-delete list twice, once with atCommit true and
224 * once with atCommit false. Hence, it will be physically deleted at end
225 * of xact in either case (and the other entry will be ignored by
226 * smgrDoPendingDeletes, so no error will occur). We could instead remove
227 * the existing list entry and delete the physical file immediately, but
228 * for now I'll keep the logic simple.
235 * RelationPreserveStorage
236 * Mark a relation as not to be deleted after all.
238 * We need this function because relation mapping changes are committed
239 * separately from commit of the whole transaction, so it's still possible
240 * for the transaction to abort after the mapping update is done.
241 * When a new physical relation is installed in the map, it would be
242 * scheduled for delete-on-abort, so we'd delete it, and be in trouble.
243 * The relation mapper fixes this by telling us to not delete such relations
244 * after all as part of its commit.
246 * We also use this to reuse an old build of an index during ALTER TABLE, this
247 * time removing the delete-at-commit entry.
249 * No-op if the relation is not among those scheduled for deletion.
265 /* unlink and delete list entry */
271 /* prev does not change */
275 /* unrelated entry, don't touch it */
283 * Physically truncate a relation to the specified number of blocks.
285 * This includes getting rid of any buffers for the blocks that are to be
293 bool need_fsm_vacuum =
false;
301 * Make sure smgr_targblock etc aren't pointing somewhere past new end.
302 * (Note: don't rely on this reln pointer below this loop.)
309 /* Prepare for truncation of MAIN fork of the relation */
312 blocks[nforks] = nblocks;
315 /* Prepare for truncation of the FSM if it exists */
325 need_fsm_vacuum =
true;
329 /* Prepare for truncation of the visibility map too if it exists */
345 * The code which follows can interact with concurrent checkpoints in two
348 * First, the truncation operation might drop buffers that the checkpoint
349 * otherwise would have flushed. If it does, then it's essential that the
350 * files actually get truncated on disk before the checkpoint record is
351 * written. Otherwise, if replay begins from that checkpoint, the
352 * to-be-truncated blocks might still exist on disk but have older
353 * contents than expected, which can cause replay to fail. It's OK for the
354 * blocks to not exist on disk at all, but not for them to have the wrong
355 * contents. For this reason, we need to set DELAY_CHKPT_COMPLETE while
356 * this code executes.
358 * Second, the call to smgrtruncate() below will in turn call
359 * RegisterSyncRequest(). We need the sync request created by that call to
360 * be processed before the checkpoint completes. CheckPointGuts() will
361 * call ProcessSyncRequests(), but if we register our sync request after
362 * that happens, then the WAL record for the truncation could end up
363 * preceding the checkpoint record, while the actual sync doesn't happen
364 * until the next checkpoint. To prevent that, we need to set
365 * DELAY_CHKPT_START here. That way, if the XLOG_SMGR_TRUNCATE precedes
366 * the redo pointer of a concurrent checkpoint, we're guaranteed that the
367 * corresponding sync request will be processed before the checkpoint
374 * We WAL-log the truncation first and then truncate in a critical
375 * section. Truncation drops buffers, even if dirty, and then truncates
376 * disk files. All of that work needs to complete before the lock is
377 * released, or else old versions of pages on disk that are missing recent
378 * changes would become accessible again. We'll try the whole operation
379 * again in crash recovery if we panic, but even then we can't give up
380 * because we don't want standbys' relation sizes to diverge and break
381 * replay or visibility invariants downstream. The critical section also
382 * suppresses interrupts.
384 * (See also visibilitymap.c if changing this code.)
391 * Make an XLOG entry reporting the file truncation.
396 xlrec.
blkno = nblocks;
407 * Flush, because otherwise the truncation of the main relation might
408 * hit the disk before the WAL record, and the truncation of the FSM
409 * or visibility map. If we crashed during that window, we'd be left
410 * with a truncated heap, but the FSM or visibility map would still
411 * contain entries for the non-existent heap pages, and standbys would
412 * also never replay the truncation.
418 * This will first remove any buffers from the buffer pool that should no
419 * longer exist after truncation is complete, and then truncate the
420 * corresponding files on disk.
426 /* We've done all the critical work, so checkpoints are OK now. */
430 * Update upper-level FSM pages to account for the truncation. This is
431 * important because the just-truncated pages were likely marked as
432 * all-free, and would be preferentially selected.
434 * NB: There's no point in delaying checkpoints until this is done.
435 * Because the FSM is not WAL-logged, we have to be prepared for the
436 * possibility of corruption after a crash anyway.
443 * RelationPreTruncate
444 * Perform AM-independent work before a physical truncation.
446 * If an access method's relation_nontransactional_truncate does not call
447 * RelationTruncate(), it must call this before decreasing the table size.
465 * Copy a fork's data, block by block.
467 * Note that this requires that there is no dirty data in shared buffers. If
468 * it's possible that there are, callers need to flush those using
469 * e.g. FlushRelationBuffers(rel).
471 * Also note that this is frequently called via locutions such as
472 * RelationCopyStorage(RelationGetSmgr(rel), ...);
473 * That's safe only because we perform only smgr and WAL operations here.
474 * If we invoked anything else, a relcache flush could cause our SMgrRelation
475 * argument to become a dangling pointer.
482 bool copying_initfork;
488 * The init fork for an unlogged relation in many respects has to be
489 * treated the same as a normal relation: changes need to be WAL-logged and
490 * it needs to be synced to disk.
492 copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
496 * We need to log the copied data in WAL iff WAL archiving/streaming is
497 * enabled AND it's a permanent relation. This gives the same answer as
498 * "RelationNeedsWAL(rel) || copying_initfork", because we know the
499 * current operation created new relation storage.
502 (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
508 for (blkno = 0; blkno < nblocks; blkno++)
515 /* If we got a cancel signal during the copy of the data, quit */
537 * For paranoia's sake, capture the file path before invoking the
538 * ereport machinery. This guards against the possibility of a
539 * relcache flush caused by, e.g., an errcontext callback.
540 * (errcontext callbacks shouldn't be risking any such thing, but
541 * people have been known to forget that rule.)
549 errmsg(
"invalid page in block %u of relation \"%s\"",
554 * Queue the page for WAL-logging and writing out. Unfortunately we
555 * don't know what kind of a page this is, so we have to log the full
556 * page including any unused space.
564 * RelFileLocatorSkippingWAL
565 * Check if a BM_PERMANENT relfilelocator is using WAL.
567 * Changes to certain relations must not write WAL; see "Skipping WAL for
568 * New RelFileLocator" in src/backend/access/transam/README. Though this can
569 * be determined efficiently from a Relation, this function is intended for
570 * code paths that do not have access to a Relation.
583 * EstimatePendingSyncsSpace
584 * Estimate space needed to pass syncs to parallel workers.
596 * SerializePendingSyncs
597 * Serialize syncs for parallel workers.
613 /* Create temporary hash to collect active relfilelocators */
621 /* collect all rlocators from pending syncs */
626 /* remove deleted rlocators */
628 if (delete->atCommit)
643 * RestorePendingSyncs
644 * Restore syncs within a parallel worker.
646 * RelationNeedsWAL() and RelFileLocatorSkippingWAL() must offer the correct
647 * answer to parallel workers. Only smgrDoPendingSyncs() reads the
648 * is_truncated field, at end of transaction. Hence, don't restore it.
662 * smgrDoPendingDeletes() -- Take care of relation deletes at end of xact.
664 * This also runs when aborting a subxact; we want to clean up a failed
665 * subxact immediately.
667 * Note: It's possible that we're being asked to remove a relation that has
668 * no physical storage in any fork. In particular, it's possible that we're
669 * cleaning up an old temporary relation for which RemovePgTempFiles has
670 * already recovered the physical storage.
689 /* outer-level entries should not be processed yet */
694 /* unlink list entry first, so we don't retry on failure */
699 /* do deletion if called for */
706 /* allocate the initial array, or extend it, if needed */
712 else if (maxrels <= nrels)
718 srels[nrels++] = srel;
720 /* must explicitly free the list entry */
722 /* prev does not change */
730 for (
int i = 0;
i < nrels;
i++)
738 * smgrDoPendingSyncs() -- Take care of relation syncs at end of xact.
753 return;
/* no relation needs sync */
755 /* Abort -- just throw away all pending syncs */
764 /* Parallel worker -- just throw away all pending syncs */
765 if (isParallelWorker)
771 /* Skip syncing nodes that smgrDoPendingDeletes() will delete. */
788 * We emit newpage WAL records for smaller relations.
790 * Small WAL records have a chance to be flushed along with other
791 * backends' WAL records. We emit WAL records instead of syncing for
792 * files that are smaller than a certain threshold, expecting faster
793 * commit. The threshold is defined by the GUC wal_skip_threshold.
803 /* we shouldn't come here for unlogged relations */
814 * Sync file or emit WAL records for its contents.
816 * Although we emit WAL records if the file is small enough, do a file
817 * sync regardless of the size if the file has experienced a
818 * truncation. It is because the file would be followed by trailing
819 * garbage blocks after a crash recovery if, while a past longer file
820 * had been flushed out, we omitted syncing-out of the file and
821 * emitted WAL instead. You might think that we could choose WAL if
822 * the current main fork is longer than ever, but there's a case where
823 * main fork is longer than ever but FSM fork gets shorter.
828 /* allocate the initial array, or extend it, if needed */
834 else if (maxrels <= nrels)
840 srels[nrels++] = srel;
844 /* Emit WAL records for all blocks. The file is small enough. */
847 int n = nblocks[fork];
854 * Emit WAL for the whole file. Unfortunately we don't know
855 * what kind of a page this is, so we have to log the full
856 * page including any unused space. ReadBufferExtended()
857 * counts some pgstat events; unfortunately, we discard them.
876 * smgrGetPendingDeletes() -- Get a list of non-temp relations to be deleted.
878 * The return value is the number of relations scheduled for termination.
879 * *ptr is set to point to a freshly-palloc'd array of RelFileLocators.
880 * If there are no relations to be deleted, *ptr is set to NULL.
882 * Only non-temporary relations are included in the returned list. This is OK
883 * because the list is used only in contexts where temporary relations don't
884 * matter: we're either writing to the two-phase state file (and transactions
885 * that have touched temp tables can't be prepared) or we're writing to xlog
886 * (and all temporary files will be zapped if we restart anyway, so no need
887 * for redo to do it also).
889 * Note that the list does not include anything scheduled for termination
890 * by upper-level transactions.
927 * PostPrepare_smgr -- Clean up after a successful PREPARE
929 * What we have to do here is throw away the in-memory state about pending
930 * relation deletes. It's all been recorded in the 2PC state file and
931 * it's no longer smgr's job to worry about it.
943 /* must explicitly free the list entry */
950 * AtSubCommit_smgr() --- Take care of subtransaction commit.
952 * Reassign all items in the pending-deletes list to the parent transaction.
968 * AtSubAbort_smgr() --- Take care of subtransaction abort.
970 * Delete created relations and forget about deleted relations.
971 * We can execute these operations immediately because we know this
972 * subtransaction will not commit.
986 /* Backup blocks are not used in smgr records */
1006 bool need_fsm_vacuum =
false;
1011 * Forcibly create relation if it doesn't exist (which suggests that
1012 * it was dropped somewhere later in the WAL sequence). As in
1013 * XLogReadBufferForRedo, we prefer to recreate the rel and replay the
1014 * log as best we can until the drop is seen.
1019 * Before we perform the truncation, update minimum recovery point to
1020 * cover this WAL record. Once the relation is truncated, there's no
1021 * going back. The buffer manager enforces the WAL-first rule for
1022 * normal updates to relation files, so that the minimum recovery
1023 * point is always updated before the corresponding change in the data
1024 * file is flushed to disk. We have to do the same manually here.
1026 * Doing this before the truncation means that if the truncation fails
1027 * for some reason, you cannot start up the system even after restart,
1028 * until you fix the underlying situation so that the truncation will
1029 * succeed. Alternatively, we could update the minimum recovery point
1030 * after truncation, but that would leave a small window where the
1031 * WAL-first rule could be violated.
1035 /* Prepare for truncation of MAIN fork */
1040 blocks[nforks] = xlrec->
blkno;
1043 /* Also tell xlogutils.c about it */
1047 /* Prepare for truncation of FSM and VM too */
1059 need_fsm_vacuum =
true;
1074 /* Do the real work to truncate relation forks */
1083 * Update upper-level FSM pages to account for the truncation. This is
1084 * important because the just-truncated pages were likely marked as
1085 * all-free, and would be preferentially selected.
1087 if (need_fsm_vacuum)
1094 elog(
PANIC,
"smgr_redo: unknown op code %u", info);
#define InvalidBlockNumber
static bool BlockNumberIsValid(BlockNumber blockNumber)
bool ignore_checksum_failure
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
#define PIV_IGNORE_CHECKSUM_FAILURE
void smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
BulkWriteBuffer smgr_bulk_get_buf(BulkWriteState *bulkstate)
BulkWriteState * smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
void smgr_bulk_finish(BulkWriteState *bulkstate)
#define MemSet(start, val, len)
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
void hash_destroy(HTAB *hashp)
void * hash_seq_search(HASH_SEQ_STATUS *status)
int64 hash_get_num_entries(HTAB *hashp)
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
BlockNumber FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
Assert(PointerIsAligned(start, uint64))
void * MemoryContextAlloc(MemoryContext context, Size size)
MemoryContext TopTransactionContext
void * repalloc(void *pointer, Size size)
void pfree(void *pointer)
MemoryContext TopMemoryContext
MemoryContext CurrentMemoryContext
#define START_CRIT_SECTION()
#define CHECK_FOR_INTERRUPTS()
#define END_CRIT_SECTION()
static bool checksum_failure
#define ERRCODE_DATA_CORRUPTED
void pgstat_prepare_report_checksum_failure(Oid dboid)
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
#define DELAY_CHKPT_START
#define DELAY_CHKPT_COMPLETE
#define INVALID_PROC_NUMBER
#define ProcNumberForTempRelations()
static SMgrRelation RelationGetSmgr(Relation rel)
#define RelationNeedsWAL(relation)
static void RelationCloseSmgr(Relation relation)
#define AssertPendingSyncs_RelationCache()
struct RelFileLocator RelFileLocator
#define RelFileLocatorEquals(locator1, locator2)
#define relpath(rlocator, forknum)
#define relpathbackend(rlocator, backend, forknum)
Size mul_size(Size s1, Size s2)
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
void smgrdosyncall(SMgrRelation *rels, int nrels)
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
void smgrclose(SMgrRelation reln)
void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *old_nblocks, BlockNumber *nblocks)
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
static void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer)
void RelationPreserveStorage(RelFileLocator rlocator, bool atCommit)
void RelationPreTruncate(Relation rel)
struct PendingRelDelete PendingRelDelete
void SerializePendingSyncs(Size maxSize, char *startAddress)
void AtSubCommit_smgr(void)
Size EstimatePendingSyncsSpace(void)
static HTAB * pendingSyncHash
struct PendingRelSync PendingRelSync
static PendingRelDelete * pendingDeletes
void AtSubAbort_smgr(void)
void RelationCopyStorage(SMgrRelation src, SMgrRelation dst, ForkNumber forkNum, char relpersistence)
void smgr_redo(XLogReaderState *record)
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
int smgrGetPendingDeletes(bool forCommit, RelFileLocator **ptr)
void PostPrepare_smgr(void)
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
static void AddPendingSync(const RelFileLocator *rlocator)
void RestorePendingSyncs(char *startAddress)
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
void RelationDropStorage(Relation rel)
void RelationTruncate(Relation rel, BlockNumber nblocks)
void smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
void smgrDoPendingDeletes(bool isCommit)
#define SMGR_TRUNCATE_ALL
#define XLOG_SMGR_TRUNCATE
#define SMGR_TRUNCATE_HEAP
#define SMGR_TRUNCATE_FSM
struct PendingRelDelete * next
RelFileLocator rd_locator
BlockNumber smgr_targblock
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
RelFileLocatorBackend smgr_rlocator
BlockNumber visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks)
int GetCurrentTransactionNestLevel(void)
bool IsInParallelMode(void)
void XLogFlush(XLogRecPtr record)
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
void XLogRegisterData(const void *data, uint32 len)
void log_newpage_range(Relation rel, ForkNumber forknum, BlockNumber startblk, BlockNumber endblk, bool page_std)
void XLogBeginInsert(void)
#define XLogRecGetInfo(decoder)
#define XLogRecGetData(decoder)
#define XLogRecHasAnyBlockRefs(decoder)
#define XLR_SPECIAL_REL_UPDATE
void FreeFakeRelcacheEntry(Relation fakerel)
void XLogTruncateRelation(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nblocks)
Relation CreateFakeRelcacheEntry(RelFileLocator rlocator)