/*-------------------------------------------------------------------------
 *
 * bulk_write.c
 *	  Efficiently and reliably populate a new relation
 *
 * The assumption is that no other backends access the relation while we are
 * loading it, so we can take some shortcuts.  Pages already present in the
 * indicated fork when the bulk write operation is started are not modified
 * unless explicitly written to.  Do not mix operations through the regular
 * buffer manager and the bulk loading interface!  (A usage sketch follows
 * this header comment.)
 *
 * We bypass the buffer manager to avoid the locking overhead, and call
 * smgrextend() directly.  A downside is that the pages will need to be
 * re-read into shared buffers on first use after the build finishes.  That's
 * usually a good tradeoff for large relations, and for small relations, the
 * overhead isn't very significant compared to creating the relation in the
 * first place.
 *
 * The pages are WAL-logged if needed.  To save on WAL header overhead, we
 * WAL-log several pages in one record.
 *
 * One tricky point is that because we bypass the buffer manager, we need to
 * register the relation for fsyncing at the next checkpoint ourselves, and
 * make sure that the relation is correctly fsync'd by us or the checkpointer
 * even if a checkpoint happens concurrently.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/smgr/bulk_write.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "access/xloginsert.h"
#include "access/xlogrecord.h"
#include "storage/bufpage.h"
#include "storage/bulk_write.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "utils/rel.h"
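/*
 * A sketch of a typical caller, for illustration only: the function below is
 * hypothetical and not part of this file, but the smgr_bulk_* calls are the
 * real interface declared in storage/bulk_write.h.  Note that each buffer is
 * handed over to smgr_bulk_write(), which takes ownership of it.
 */
#ifdef NOT_USED
static void
example_bulk_load(Relation rel, BlockNumber nblocks)
{
    BulkWriteState *bulkw;

    bulkw = smgr_bulk_start_rel(rel, MAIN_FORKNUM);
    for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
    {
        BulkWriteBuffer buf = smgr_bulk_get_buf(bulkw);

        PageInit((Page) buf->data, BLCKSZ, 0);
        /* ... fill the page ... */
        smgr_bulk_write(bulkw, blkno, buf, true);	/* page_std=true */
    }
    smgr_bulk_finish(bulkw);
}
#endif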
/*
 * We WAL-log the pending writes as a single record, so we cannot queue more
 * pages at a time than one WAL record can reference.
 */
#define MAX_PENDING_WRITES XLR_MAX_BLOCK_ID

static const PGIOAlignedBlock zero_buffer = {{0}};	/* worth BLCKSZ */

typedef struct PendingWrite
{
    BulkWriteBuffer buf;
    BlockNumber blkno;
    bool        page_std;
} PendingWrite;
/*
 * Bulk writer state for one relation fork.
 */
struct BulkWriteState
{
    /* Information about the target relation we're writing */
    SMgrRelation smgr;
    ForkNumber  forknum;
    bool        use_wal;

    /* We keep several writes queued, and WAL-log them in batches */
    int         npending;
    PendingWrite pending_writes[MAX_PENDING_WRITES];

    /* Current size of the relation */
    BlockNumber relsize;

    /* The RedoRecPtr at the time that the bulk operation started */
    XLogRecPtr  start_RedoRecPtr;

    MemoryContext memcxt;
};

static void smgr_bulk_flush(BulkWriteState *bulkstate);
/*
 * Start a bulk write operation on a relation fork.
 */
BulkWriteState *
smgr_bulk_start_rel(Relation rel, ForkNumber forknum)
{
    return smgr_bulk_start_smgr(RelationGetSmgr(rel),
                                forknum,
                                RelationNeedsWAL(rel) || forknum == INIT_FORKNUM);
}
/*
 * Start a bulk write operation on a relation fork.
 *
 * This is like smgr_bulk_start_rel, but can be used without a relcache entry.
 */
BulkWriteState *
smgr_bulk_start_smgr(SMgrRelation smgr, ForkNumber forknum, bool use_wal)
{
    BulkWriteState *state;

    state = palloc(sizeof(BulkWriteState));
    state->smgr = smgr;
    state->forknum = forknum;
    state->use_wal = use_wal;

    state->npending = 0;
    state->relsize = smgrnblocks(smgr, forknum);

    state->start_RedoRecPtr = GetRedoRecPtr();

    /*
     * Remember the memory context.  We will use it to allocate all the
     * buffers later.
     */
    state->memcxt = CurrentMemoryContext;

    return state;
}
/*
 * Finish bulk write operation.
 *
 * This WAL-logs and flushes any remaining pending writes to disk, and fsyncs
 * the relation if needed.
 */
void
smgr_bulk_finish(BulkWriteState *bulkstate)
{
    /* WAL-log and flush any remaining pages */
    smgr_bulk_flush(bulkstate);

    /*
     * Fsync the relation, or register it for the next checkpoint, if
     * necessary.
     */
    if (SmgrIsTemp(bulkstate->smgr))
    {
        /* Temporary relations don't need to be fsync'd, ever */
    }
    else if (!bulkstate->use_wal)
    {
        /*
         * This is either an unlogged relation, or a permanent relation but
         * we skipped WAL-logging because wal_level=minimal:
         *
         * A) Unlogged relation
         *
         *    Unlogged relations will go away on crash, but they need to be
         *    fsync'd on a clean shutdown.  It's sufficient to call
         *    smgrregistersync(), which ensures that the checkpointer will
         *    flush it at the shutdown checkpoint.  (It will flush it on the
         *    next online checkpoint too, which is not strictly necessary.)
         *
         *    Note that the init-fork of an unlogged relation is not
         *    considered unlogged for our purposes.  It's treated like a
         *    regular permanent relation.  The callers will pass use_wal=true
         *    for the init fork.
         *
         * B) Permanent relation, WAL-logging skipped because
         *    wal_level=minimal
         *
         *    This is a new relation, and we didn't WAL-log the pages as we
         *    wrote, but they need to be fsync'd before commit.
         *
         *    We don't need to do that here, however.  The fsync() is done at
         *    commit, by smgrDoPendingSyncs() (*).
         *
         * (*) smgrDoPendingSyncs() might decide to WAL-log the whole
         * relation at commit instead of fsyncing it, if the relation was
         * very small, but it's smgrDoPendingSyncs()'s responsibility in any
         * case.
         *
         * We cannot distinguish the two cases here, so conservatively assume
         * it's an unlogged relation.  A permanent relation with
         * wal_level=minimal would require no action at all, see above.
         */
        smgrregistersync(bulkstate->smgr, bulkstate->forknum);
    }
    else
    {
        /*
         * Permanent relation, WAL-logged normally.
         *
         * We already WAL-logged all the pages, so they will be replayed from
         * WAL on crash.  However, when we wrote out the pages, we passed
         * skipFsync=true to avoid the overhead of registering all the writes
         * with the checkpointer.  Register the whole relation now.
         *
         * There is one hole in that idea: If a checkpoint occurred while we
         * were writing the pages, it already missed fsyncing the pages we
         * had written before the checkpoint started.  A crash later on would
         * replay the WAL starting from the checkpoint, therefore it wouldn't
         * replay our earlier WAL records.  So if a checkpoint started after
         * the bulk write began, fsync the files now.
         */
        /*
         * Prevent a checkpoint from starting between the GetRedoRecPtr() and
         * smgrregistersync() calls.
         */
        Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
        MyProc->delayChkptFlags |= DELAY_CHKPT_START;

        if (bulkstate->start_RedoRecPtr != GetRedoRecPtr())
        {
            /*
             * A checkpoint occurred and it didn't know about our writes, so
             * fsync() the relation ourselves.
             */
            MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
            smgrimmedsync(bulkstate->smgr, bulkstate->forknum);
            elog(DEBUG1, "flushed relation because a checkpoint occurred concurrently");
        }
        else
        {
            smgrregistersync(bulkstate->smgr, bulkstate->forknum);
            MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
        }
    }
}
/* qsort comparator, to sort the pending writes by target block number */
static int
buffer_cmp(const void *a, const void *b)
{
    const PendingWrite *bufa = (const PendingWrite *) a;
    const PendingWrite *bufb = (const PendingWrite *) b;

    /* We should not see duplicated writes for the same block */
    Assert(bufa->blkno != bufb->blkno);
    if (bufa->blkno > bufb->blkno)
        return 1;
    else
        return -1;
}
/*
 * Finish all the pending writes.
 */
static void
smgr_bulk_flush(BulkWriteState *bulkstate)
{
    int         npending = bulkstate->npending;
    PendingWrite *pending_writes = bulkstate->pending_writes;

    if (npending == 0)
        return;

    /* Sort the writes so that they are written out in sequential order */
    if (npending > 1)
        qsort(pending_writes, npending, sizeof(PendingWrite), buffer_cmp);

    if (bulkstate->use_wal)
    {
        BlockNumber blknos[MAX_PENDING_WRITES];
        Page        pages[MAX_PENDING_WRITES];
        bool        page_std = true;

        for (int i = 0; i < npending; i++)
        {
            blknos[i] = pending_writes[i].blkno;
            pages[i] = pending_writes[i].buf->data;

            /*
             * If any of the pages use !page_std, we log them all as such.
             * That's a bit wasteful, but in practice, a mix of standard and
             * non-standard page layout is rare.  None of the built-in AMs do
             * that.
             */
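            /*
             * For example (an illustration, not from the original comments):
             * if three pages are pending and only one of them was queued
             * with page_std=false, all three are WAL-logged with
             * page_std=false, i.e. without the standard-page optimization of
             * omitting the unused space between pd_lower and pd_upper.
             */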
            if (!pending_writes[i].page_std)
                page_std = false;
        }

        log_newpages(&bulkstate->smgr->smgr_rlocator.locator, bulkstate->forknum,
                     npending, blknos, pages, page_std);
    }
    for (int i = 0; i < npending; i++)
    {
        BlockNumber blkno = pending_writes[i].blkno;
        Page        page = pending_writes[i].buf->data;

        PageSetChecksumInplace(page, blkno);

        if (blkno >= bulkstate->relsize)
        {
            /*
             * If we have to write pages nonsequentially, fill in the space
             * with zeroes until we come back and overwrite.  This is not
             * logically necessary on standard Unix filesystems (unwritten
             * space will read as zeroes anyway), but it should help to avoid
             * fragmentation.  The dummy pages aren't WAL-logged though.
             */
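            /*
             * For example (an illustration, not from the original comments):
             * if relsize is 3 and the only queued page is block 5, the loop
             * below extends the relation with all-zero pages as blocks 3 and
             * 4; the real page is then written as block 5, leaving relsize
             * at 6.
             */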
            while (blkno > bulkstate->relsize)
            {
                /* don't set checksum for all-zero page */
                smgrextend(bulkstate->smgr, bulkstate->forknum,
                           bulkstate->relsize, &zero_buffer, true);
                bulkstate->relsize++;
            }

            smgrextend(bulkstate->smgr, bulkstate->forknum, blkno, page, true);
            bulkstate->relsize++;
        }
        else
            smgrwrite(bulkstate->smgr, bulkstate->forknum, blkno, page, true);

        pfree(pending_writes[i].buf);
    }

    bulkstate->npending = 0;
}
/*
 * Queue write of 'buf'.
 *
 * NB: this takes ownership of 'buf'!
 *
 * You are only allowed to write a given block once as part of one bulk write
 * operation.
 */
void
smgr_bulk_write(BulkWriteState *bulkstate, BlockNumber blocknum, BulkWriteBuffer buf, bool page_std)
{
    PendingWrite *w;

    w = &bulkstate->pending_writes[bulkstate->npending++];
    w->buf = buf;
    w->blkno = blocknum;
    w->page_std = page_std;

    if (bulkstate->npending == MAX_PENDING_WRITES)
        smgr_bulk_flush(bulkstate);
}
/*
 * Allocate a new buffer which can later be written with smgr_bulk_write().
 *
 * There is no function to free the buffer.  When you pass it to
 * smgr_bulk_write(), it takes ownership and frees it when it's no longer
 * needed.
 *
 * This is currently implemented as a simple palloc, but could be implemented
 * using a ring buffer or larger chunks in the future, so don't rely on it.
 */
BulkWriteBuffer
smgr_bulk_get_buf(BulkWriteState *bulkstate)
{
    return MemoryContextAllocAligned(bulkstate->memcxt, BLCKSZ, PG_IO_ALIGN_SIZE, 0);
}
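/*
 * A second sketch, for illustration only: smgr_bulk_start_smgr() can be used
 * where no relcache entry is at hand.  The function below is hypothetical;
 * it writes a single-page init fork, which must always be WAL-logged, hence
 * use_wal=true (see the notes in smgr_bulk_finish() above).
 */
#ifdef NOT_USED
static void
example_write_init_fork(SMgrRelation srel)
{
    BulkWriteState *bulkw;
    BulkWriteBuffer buf;

    bulkw = smgr_bulk_start_smgr(srel, INIT_FORKNUM, true);
    buf = smgr_bulk_get_buf(bulkw);
    PageInit((Page) buf->data, BLCKSZ, 0);
    smgr_bulk_write(bulkw, 0, buf, true);
    smgr_bulk_finish(bulkw);
}
#endif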