/*-------------------------------------------------------------------------
 *
 * sync.c
 *    File synchronization management code.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *    src/backend/storage/sync/sync.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * In some contexts (currently, standalone backends and the checkpointer)
 * we keep track of pending fsync operations: we need to remember all relation
 * segments that have been written since the last checkpoint, so that we can
 * fsync them down to disk before completing the next checkpoint.  This hash
 * table remembers the pending operations.  We use a hash table mostly as
 * a convenient way of merging duplicate requests.
 *
 * We use a similar mechanism to remember no-longer-needed files that can
 * be deleted after the next checkpoint, but we use a linked list instead of
 * a hash table, because we don't expect there to be any duplicate requests.
 *
 * These mechanisms are only used for non-temp relations; we never fsync
 * temp rels, nor do we need to postpone their deletion (see comments in
 * mdunlink()).
 *
 * (Regular backends do not track pending operations locally, but forward
 * them to the checkpointer.)
 */
    bool        canceled;       /* canceled is true if we canceled "recently" */

    bool        canceled;       /* true if request has been canceled */
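/*
 * Context sketch (not verbatim from the file): the two "canceled" fields
 * above belong to the pending-fsync hash entry and the pending-unlink list
 * entry, which look roughly like this, together with the static state that
 * holds them.  Field layout is assumed from the surrounding comments.
 */
typedef uint16 CycleCtr;        /* can be any convenient integer size */

typedef struct
{
    FileTag     tag;            /* identifies handler and file */
    CycleCtr    cycle_ctr;      /* sync_cycle_ctr of oldest request */
    bool        canceled;       /* canceled is true if we canceled "recently" */
} PendingFsyncEntry;

typedef struct
{
    FileTag     tag;            /* identifies handler and file */
    CycleCtr    cycle_ctr;      /* checkpoint_cycle_ctr when request was made */
    bool        canceled;       /* true if request has been canceled */
} PendingUnlinkEntry;

static HTAB *pendingOps = NULL;
static List *pendingUnlinks = NIL;
static MemoryContext pendingOpsCxt; /* context for the above */

static CycleCtr sync_cycle_ctr = 0;
static CycleCtr checkpoint_cycle_ctr = 0;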
/* Intervals for calling AbsorbSyncRequests */
#define FSYNCS_PER_ABSORB       10
#define UNLINKS_PER_ABSORB      10
/*
 * Function pointers for handling sync and unlink requests.
 */

/*
 * These indexes must correspond to the values of the SyncRequestHandler enum.
 */
    /* pg_multixact/offsets */
    /* pg_multixact/members */
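/*
 * Sketch of the dispatch table the comments above describe (assumed, not
 * verbatim; enum member names other than the two pg_multixact handlers are
 * assumptions): each SyncRequestHandler value maps to a set of callbacks.
 */
typedef struct SyncOps
{
    int         (*sync_syncfiletag) (const FileTag *ftag, char *path);
    int         (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
    bool        (*sync_filetagmatches) (const FileTag *ftag,
                                        const FileTag *candidate);
} SyncOps;

static const SyncOps syncsw[] = {
    /* magnetic disk */
    [SYNC_HANDLER_MD] = {
        .sync_syncfiletag = mdsyncfiletag,
        .sync_unlinkfiletag = mdunlinkfiletag,
        .sync_filetagmatches = mdfiletagmatches
    },
    /* pg_multixact/offsets */
    [SYNC_HANDLER_MULTIXACT_OFFSET] = {
        .sync_syncfiletag = multixactoffsetssyncfiletag
    },
    /* pg_multixact/members */
    [SYNC_HANDLER_MULTIXACT_MEMBER] = {
        .sync_syncfiletag = multixactmemberssyncfiletag
    }
    /* ... entries for the remaining handlers ... */
};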
/*
 * Initialize data structures for the file sync tracking.
 */

    /*
     * Create pending-operations hashtable if we need it.  Currently, we need
     * it if we are standalone (not under a postmaster) or if we are a
     * checkpointer auxiliary process.
     */

        /*
         * XXX: The checkpointer needs to add entries to the pending ops table
         * when absorbing fsync requests.  That is done within a critical
         * section, which isn't usually allowed, but we make an exception. It
         * means that there's a theoretical possibility that you run out of
         * memory while absorbing fsync requests, which leads to a PANIC.
         * Fortunately the hash table is small so that's unlikely to happen in
         * practice.
         */
        pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, "Pending ops context",
                                              ALLOCSET_DEFAULT_SIZES);
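        /*
         * Sketch of the rest of this setup (assumed, not verbatim): the
         * context is explicitly allowed in critical sections, and the hash
         * table keyed by FileTag is created in it.  Table name and sizing
         * below are assumptions.
         */
        MemoryContextAllowInCriticalSection(pendingOpsCxt, true);

        {
            HASHCTL     hash_ctl;

            hash_ctl.keysize = sizeof(FileTag);
            hash_ctl.entrysize = sizeof(PendingFsyncEntry);
            hash_ctl.hcxt = pendingOpsCxt;
            pendingOps = hash_create("Pending Ops Table",
                                     100L,
                                     &hash_ctl,
                                     HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
        }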
/*
 * SyncPreCheckpoint() -- Do pre-checkpoint work
 *
 * To distinguish unlink requests that arrived before this checkpoint
 * started from those that arrived during the checkpoint, we use a cycle
 * counter similar to the one we use for fsync requests.  That cycle
 * counter is incremented here.
 *
 * This must be called *before* the checkpoint REDO point is determined.
 * That ensures that we won't delete files too soon.  Since this calls
 * AbsorbSyncRequests(), which performs memory allocations, it cannot be
 * called within a critical section.
 *
 * Note that we can't do anything here that depends on the assumption
 * that the checkpoint will be completed.
 */
    /*
     * Operations such as DROP TABLESPACE assume that the next checkpoint will
     * process all recently forwarded unlink requests, but if they aren't
     * absorbed prior to advancing the cycle counter, they won't be processed
     * until a future checkpoint.  The following absorb ensures that any
     * unlink requests forwarded before the checkpoint began will be processed
     * in the current checkpoint.
     */
    /*
     * Any unlink requests arriving after this point will be assigned the next
     * cycle counter, and won't be unlinked until the next checkpoint.
     */
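    /*
     * Putting the two comments above together, the body of
     * SyncPreCheckpoint() boils down to (sketch, assumed from the comments):
     */
    AbsorbSyncRequests();       /* pick up already-forwarded unlink requests */
    checkpoint_cycle_ctr++;     /* later arrivals wait for the next checkpoint */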
/*
 * SyncPostCheckpoint() -- Do post-checkpoint work
 *
 * Remove any lingering files that can now be safely removed.
 */
        /* Skip over any canceled entries */
        /*
         * New entries are appended to the end, so if the entry is new we've
         * reached the end of old entries.
         *
         * Note: if just the right number of consecutive checkpoints fail, we
         * could be fooled here by cycle_ctr wraparound.  However, the only
         * consequence is that we'd delay unlinking for one more checkpoint,
         * which is perfectly tolerable.
         */
        /* Unlink the file */
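        /*
         * Sketch of the dispatch (assumed, not verbatim): the removal goes
         * through the unlink callback registered in syncsw[] for this tag's
         * handler, which fills in "path" and returns a negative value with
         * errno set on failure; failures are handled just below.
         */
        char        path[MAXPGPATH];

        if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, path) < 0)
        {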
            /*
             * There's a race condition when the database is dropped at the
             * same time that we process the pending unlink requests.  If the
             * DROP DATABASE deletes the file before we do, we will get ENOENT
             * here.  rmtree() also has to ignore ENOENT errors, to deal with
             * the possibility that we delete the file first.
             */
            if (errno != ENOENT)
                ereport(WARNING,
                        (errcode_for_file_access(),
                         errmsg("could not remove file \"%s\": %m", path)));
        }

        /* Mark the list entry as canceled, just in case */
        entry->canceled = true;
        /*
         * As in ProcessSyncRequests, we don't want to stop absorbing fsync
         * requests for a long time when there are many deletions to be done.
         * We can safely call AbsorbSyncRequests() at this point in the loop.
         */
        if (--absorb_counter <= 0)
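        {
            /*
             * Completion sketch (assumed from the comment above): absorb the
             * queued requests and restart the countdown.
             */
            AbsorbSyncRequests();
            absorb_counter = UNLINKS_PER_ABSORB;
        }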
    /*
     * If we reached the end of the list, we can just remove the whole list
     * (remembering to pfree all the PendingUnlinkEntry objects).  Otherwise,
     * we must keep the entries at or after "lc".
     */
        for (int i = 0; i < ntodelete; i++)
            pfree(list_nth(pendingUnlinks, i));
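    /*
     * Sketch of the surrounding branch (assumed, not verbatim): if the scan
     * consumed the whole list, free it wholesale; otherwise "ntodelete"
     * above counts the processed prefix (list_cell_number of "lc"), the
     * pfree loop releases those entries, and the cells are dropped here.
     */
    if (lc == NULL)
    {
        list_free_deep(pendingUnlinks);
        pendingUnlinks = NIL;
    }
    else
        pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);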
/*
 * ProcessSyncRequests() -- Process queued fsync requests.
 */
    static bool sync_in_progress = false;

    /* Statistics on sync times */
    /*
     * This is only called during checkpoints, and checkpoints should only
     * occur in processes that have created a pendingOps.
     */
    if (!pendingOps)
        elog(ERROR, "cannot sync without a pendingOps table");
    /*
     * If we are in the checkpointer, the sync had better include all fsync
     * requests that were queued by backends up to this point.  The tightest
     * race condition that could occur is that a buffer that must be written
     * and fsync'd for the checkpoint could have been dumped by a backend just
     * before it was visited by BufferSync().  We know the backend will have
     * queued an fsync request before clearing the buffer's dirty bit, so we
     * are safe as long as we do an Absorb after completing BufferSync().
     */
    /*
     * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
     * checkpoint), we want to ignore fsync requests that are entered into the
     * hashtable after this point --- they should be processed next time,
     * instead.  We use sync_cycle_ctr to tell old entries apart from new
     * ones: new ones will have cycle_ctr equal to the incremented value of
     * sync_cycle_ctr.
     *
     * In normal circumstances, all entries present in the table at this point
     * will have cycle_ctr exactly equal to the current (about to be old)
     * value of sync_cycle_ctr.  However, if we fail partway through the
     * fsync'ing loop, then older values of cycle_ctr might remain when we
     * come back here to try again.  Repeated checkpoint failures would
     * eventually wrap the counter around to the point where an old entry
     * might appear new, causing us to skip it, possibly allowing a checkpoint
     * to succeed that should not have.  To forestall wraparound, any time the
     * previous ProcessSyncRequests() failed to complete, run through the
     * table and forcibly set cycle_ctr = sync_cycle_ctr.
     *
     * Do not be tempted to merge this loop with the main loop, as the problem
     * is exactly that that loop may fail before having visited all the
     * entries.  From a performance point of view it doesn't matter anyway, as
     * this path will never be taken in a system that's functioning normally.
     */
    if (sync_in_progress)
    {
        /* prior try failed, so update any stale cycle_ctr values */
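        /*
         * Repair-pass sketch (assumed, not verbatim): scan the whole table
         * (hstat is a HASH_SEQ_STATUS, entry a PendingFsyncEntry *) and
         * overwrite stale counters so that old entries cannot be mistaken
         * for new ones after wraparound.
         */
        hash_seq_init(&hstat, pendingOps);
        while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
            entry->cycle_ctr = sync_cycle_ctr;
    }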
    /* Advance counter so that new hashtable entries are distinguishable */
    sync_cycle_ctr++;

    /* Set flag to detect failure if we don't reach the end of the loop */
    sync_in_progress = true;
    /* Now scan the hashtable for fsync requests to process */

        /*
         * If the entry is new then don't process it this time; it was added
         * after the cycle counter was advanced, so it belongs to the next
         * checkpoint.  Note that "continue" bypasses the hash-remove call at
         * the bottom of the loop.
         */

        /* Else assert we haven't missed it */
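        /*
         * Sketch of the two checks described above (assumed, not verbatim):
         * skip entries stamped with the already-advanced counter, and assert
         * that anything else is from exactly the previous cycle.
         */
        if (entry->cycle_ctr == sync_cycle_ctr)
            continue;

        Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);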
        /*
         * If fsync is off then we don't have to bother opening the file at
         * all.  (We delay checking until this point so that changing fsync on
         * the fly behaves sensibly.)
         */
            /*
             * If in checkpointer, we want to absorb pending requests every so
             * often to prevent overflow of the fsync request queue.  It is
             * unspecified whether newly-added entries will be visited by
             * hash_seq_search, but we don't care since we don't need to
             * process them anyway.
             */
            if (--absorb_counter <= 0)
            {
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }
            /*
             * The fsync table could contain requests to fsync segments that
             * have been deleted (unlinked) by the time we get to them. Rather
             * than just hoping an ENOENT (or EACCES on Windows) error can be
             * ignored, what we do on error is absorb pending requests and
             * then retry.  Since mdunlink() queues a "cancel" message before
             * actually unlinking, the fsync request is guaranteed to be
             * marked canceled after the absorb if it really was this case.
             * DROP DATABASE likewise has to tell us to forget fsync requests
             * before it starts deletions.
             */
            for (failures = 0; !entry->canceled; failures++)
                    /* Success; update statistics about sync timing */
                    INSTR_TIME_SET_CURRENT(sync_end);
                    sync_diff = sync_end;
                    INSTR_TIME_SUBTRACT(sync_diff, sync_start);
                    elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
                    total_elapsed += elapsed;
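                    /*
                     * Remaining bookkeeping sketch (assumed): track the
                     * slowest single sync and the number processed, for the
                     * checkpoint-end report.
                     */
                    if (elapsed > longest)
                        longest = elapsed;
                    processed++;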
                    elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
                         processed, path, (double) elapsed / 1000);

                    break;      /* out of retry loop */
                }
                /*
                 * It is possible that the relation has been dropped or
                 * truncated since the fsync request was entered.  Therefore,
                 * allow ENOENT, but only if we didn't fail already on this
                 * file.
                 */
                if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
                    ereport(data_sync_elevel(ERROR),
                            (errcode_for_file_access(),
                             errmsg("could not fsync file \"%s\": %m", path)));
                /*
                 * Absorb incoming requests and check to see if a cancel
                 * arrived for this relation fork.
                 */
                AbsorbSyncRequests();
                absorb_counter = FSYNCS_PER_ABSORB;
            }                   /* end retry loop */
        /* We are done with this entry, remove it */
        if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
            elog(ERROR, "pendingOps corrupted");
    }                           /* end loop over hashtable entries */
    /* Return sync performance metrics for report at checkpoint end */
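    /*
     * Sketch (assumed; field names taken to be those of CheckpointStatsData):
     * hand the counters back through CheckpointStats for the log line
     * printed at checkpoint end.
     */
    CheckpointStats.ckpt_sync_rels = processed;
    CheckpointStats.ckpt_longest_sync = longest;
    CheckpointStats.ckpt_agg_sync_time = total_elapsed;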
    /* Flag successful completion of ProcessSyncRequests */
    sync_in_progress = false;
/*
 * RememberSyncRequest() -- callback from checkpointer side of sync request
 *
 * We stuff fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * See sync.h for more information on the types of sync requests supported.
 */
        /* Cancel previously entered request */

        /* Cancel matching fsync requests */

        /* Cancel matching unlink requests */
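        /*
         * Sketch of the filtering implied by the two comments above (assumed,
         * not verbatim): every pending fsync and unlink request whose tag
         * matches, per the handler's filetagmatches callback, is flagged as
         * canceled rather than removed outright.
         */
        HASH_SEQ_STATUS hstat;
        PendingFsyncEntry *pfe;
        ListCell   *cell;

        hash_seq_init(&hstat, pendingOps);
        while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
        {
            if (pfe->tag.handler == ftag->handler &&
                syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
                pfe->canceled = true;
        }

        foreach(cell, pendingUnlinks)
        {
            PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);

            if (pue->tag.handler == ftag->handler &&
                syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
                pue->canceled = true;
        }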
        /* Unlink request: put it in the linked list */
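        /*
         * Sketch of the unlink branch (assumed, not verbatim): the entry is
         * allocated in pendingOpsCxt so it survives until the post-checkpoint
         * pass, stamped with the current checkpoint cycle, and appended.
         */
        MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
        PendingUnlinkEntry *entry = palloc(sizeof(PendingUnlinkEntry));

        entry->tag = *ftag;
        entry->cycle_ctr = checkpoint_cycle_ctr;
        entry->canceled = false;
        pendingUnlinks = lappend(pendingUnlinks, entry);

        MemoryContextSwitchTo(oldcxt);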
        /* Normal case: enter a request to fsync this segment */

        /* if new entry, or was previously canceled, initialize it */
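        /*
         * Sketch of the normal case (assumed, not verbatim): look up or
         * create the hash entry for this tag, and (re)initialize it only
         * when it is new or was previously canceled.
         */
        bool        found;
        PendingFsyncEntry *entry;

        entry = (PendingFsyncEntry *) hash_search(pendingOps, ftag,
                                                  HASH_ENTER, &found);
        if (!found || entry->canceled)
        {
            entry->cycle_ctr = sync_cycle_ctr;
            entry->canceled = false;
        }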
        /*
         * NB: it's intentional that we don't change cycle_ctr if the entry
         * already exists.  The cycle_ctr must represent the oldest fsync
         * request that could be in the entry.
         */
/*
 * Register the sync request locally, or forward it to the checkpointer.
 *
 * If retryOnError is true, we'll keep trying if there is no space in the
 * queue.  Return true if we succeeded, or false if there wasn't space.
 */
        /* standalone backend or startup process: fsync state is local */
        /*
         * Notify the checkpointer about it.  If we fail to queue a message in
         * retryOnError mode, we have to sleep and try again ... ugly, but
         * hopefully won't happen often.
         *
         * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
         * error in the case of SYNC_UNLINK_REQUEST would leave the
         * no-longer-used file still present on disk, which would be bad, so
         * I'm inclined to assume that the checkpointer will always empty the
         * queue soon.
         */
        /*
         * If we are successful in queueing the request, or we failed and were
         * instructed not to retry on error, break.
         */
        if (ret || (!ret && !retryOnError))
            break;
        WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
                  WAIT_EVENT_REGISTER_SYNC_REQUEST);
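/*
 * Putting the pieces above together, the overall shape of
 * RegisterSyncRequest() is roughly this (sketch, assumed, not verbatim):
 * handle the request locally when we keep our own pendingOps, otherwise
 * forward it and optionally retry until the checkpointer has queue space.
 */
bool
RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
                    bool retryOnError)
{
    bool        ret;

    if (pendingOps != NULL)
    {
        /* standalone backend or startup process: fsync state is local */
        RememberSyncRequest(ftag, type);
        return true;
    }

    for (;;)
    {
        ret = ForwardSyncRequest(ftag, type);

        /* break-or-wait logic as shown above */
        if (ret || !retryOnError)
            break;
        WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
                  WAIT_EVENT_REGISTER_SYNC_REQUEST);
    }

    return ret;
}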