1/*-------------------------------------------------------------------------
6 * For documentation about how AIO works on a higher level, including a
7 * schematic example, see README.md.
10 * AIO is a complicated subsystem. To keep things navigable, it is split
11 * across a number of files:
13 * - method_*.c - different ways of executing AIO (e.g. worker process)
15 * - aio_target.c - IO on different kinds of targets
17 * - aio_io.c - method-independent code for specific IO ops (e.g. readv)
19 * - aio_callback.c - callbacks at IO operation lifecycle events
21 * - aio_init.c - per-server and per-backend initialization
23 * - aio.c - all other topics
25 * - read_stream.c - helper for reading buffered relation data
27 * - README.md - higher-level overview of AIO
30 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
31 * Portions Copyright (c) 1994, Regents of the University of California
34 * src/backend/storage/aio/aio.c
36 *-------------------------------------------------------------------------
51#include "utils/wait_event_types.h"
63/* Options for io_method. */
67#ifdef IOMETHOD_IO_URING_ENABLED
68 {
"io_uring", IOMETHOD_IO_URING,
false},
77/* global control for AIO */
80/* current backend's per-backend state */
87#ifdef IOMETHOD_IO_URING_ENABLED
88 [IOMETHOD_IO_URING] = &pgaio_uring_ops,
92/* callbacks for the configured io_method, set by assign_io_method */
96/* --------------------------------------------------------------------------------
97 * Public Functions related to PgAioHandle
98 * --------------------------------------------------------------------------------
102 * Acquire an AioHandle, waiting for IO completion if necessary.
104 * Each backend can only have one AIO handle that has been "handed out" to
105 * code, but not yet submitted or released. This restriction is necessary to
106 * ensure that it is possible for code to wait for an unused handle by waiting
107 * for in-flight IO to complete. There is a limited number of handles in each
108 * backend; if multiple handles could be handed out without being submitted,
109 * waiting for all in-flight IO to complete would not guarantee that handles
112 * It is cheap to acquire an IO handle, unless all handles are in use. In that
113 * case this function waits for the oldest IO to complete. If that is not
114 * desirable, use pgaio_io_acquire_nb().
116 * If a handle was acquired but then does not turn out to be needed,
117 * e.g. because pgaio_io_acquire() is called before starting an IO in a
118 * critical section, the handle needs to be released with pgaio_io_release().
121 * To react to the completion of the IO as soon as it is known to have
122 * completed, callbacks can be registered with pgaio_io_register_callbacks().
124 * To actually execute IO using the returned handle, the pgaio_io_start_*()
125 * family of functions is used. In many cases the pgaio_io_start_*() call will
126 * not be done directly by code that acquired the handle, but by lower level
127 * code that gets passed the handle. E.g. if code in bufmgr.c wants to perform
128 * AIO, it typically will pass the handle to smgr.c, which will pass it on to
129 * md.c, on to fd.c, which then finally calls pgaio_io_start_*(). This
130 * forwarding allows the various layers to react to the IO's completion by
131 * registering callbacks. These callbacks in turn can translate a lower
132 * layer's result into a result understandable by a higher layer.
134 * During pgaio_io_start_*() the IO is staged (i.e. prepared for execution but
135 * not submitted to the kernel). Unless in batchmode
136 * (cf. pgaio_enter_batchmode()), the IO will also get submitted for
137 * execution. Note that, whether in batchmode or not, the IO might even
138 * complete before the functions return.
140 * After pgaio_io_start_*() the AioHandle is "consumed" and may not be
141 * referenced by the IO issuing code. To e.g. wait for IO, references to the
142 * IO can be established with pgaio_io_get_wref() *before* pgaio_io_start_*()
143 * is called. pgaio_wref_wait() can be used to wait for the IO to complete.
146 * To know if the IO [partially] succeeded or failed, a PgAioReturn * can be
147 * passed to pgaio_io_acquire(). Once the issuing backend has called
148 * pgaio_wref_wait(), the PgAioReturn contains information about whether the
149 * operation succeeded and details about the first failure, if any. The error
150 * can be raised / logged with pgaio_result_report().
152 * The lifetime of the memory pointed to by *ret needs to be at least as long
153 * as the passed in resowner. If the resowner releases resources before the IO
154 * completes (typically due to an error), the reference to *ret will be
155 * cleared. In case of resowner cleanup *ret will not be updated with the
156 * results of the IO operation.
171 * Evidently all handles by this backend are in use. Just wait for
179 * Acquire an AioHandle, returning NULL if no handles are free.
181 * See pgaio_io_acquire(). The only difference is that this function will return
182 * NULL if there are no idle handles, instead of blocking.
196 elog(
ERROR,
"API violation: Only one IO can be handed out");
199 * Probably not needed today, as interrupts should not process this IO,
232 * Release IO handle that turned out to not be required.
234 * See pgaio_io_acquire() for more details.
247 * Note that no interrupts are processed between the handed_out_io
248 * check and the call to reclaim - that's important as otherwise an
249 * interrupt could have already reclaimed the handle.
255 elog(
ERROR,
"release in unexpected state");
260 * Release IO handle during resource owner cleanup.
270 * Otherwise an interrupt, in the middle of releasing the IO, could end up
271 * trying to wait for the IO, leading to state confusion.
305 /* this is expected to happen */
310 * Need to unregister the reporting of the IO's result, the memory it's
311 * referencing likely has gone away.
320 * Add a [set of] flags to the IO.
322 * Note that this combines the passed-in flags with any flags already set,
323 * rather than replacing the existing flags. This is to allow multiple callsites
335 * Returns an ID uniquely identifying the IO handle. This is only really
336 * useful for logging, as handles are reused across multiple IOs.
347 * Return the ProcNumber for the process that can use an IO handle. The
348 * mapping from IO handles to PGPROCs is static, therefore this even works
349 * when the corresponding PGPROC is not in use.
358 * Return a wait reference for the IO. Only wait references can be used to
359 * wait for an IO's completion, as handles themselves can be reused after
360 * completion. See also the comment above pgaio_io_acquire().
377/* --------------------------------------------------------------------------------
378 * Internal Functions related to PgAioHandle
379 * --------------------------------------------------------------------------------
386 * All callers need to have held interrupts in some form, otherwise
387 * interrupt processing could wait for the IO to complete, while in an
388 * intermediary state.
393 "updating state to %s",
397 * Ensure the changes signified by the new state are visible before the
398 * new state becomes visible.
402 ioh->
state = new_state;
416 * Stage IO for execution and, if appropriate, submit it immediately.
418 * Should only be called from pgaio_io_start_*().
423 bool needs_synchronous;
430 * Otherwise an interrupt, in the middle of staging and possibly executing
431 * the IO, could end up trying to wait for the IO, leading to state
441 /* allow a new IO to be staged */
449 * Synchronous execution has to be executed, well, synchronously, so check
455 "staged (synchronous: %d, in_batch: %d)",
458 if (!needs_synchronous)
464 * Unless code explicitly opted into batching IOs, submit the IO
483 * If the caller said to execute the IO synchronously, do so.
485 * XXX: We could optimize the logic when to execute synchronously by first
486 * checking if there are other IOs in flight and only synchronously
487 * executing if not. Unclear whether that'll be sufficiently common to be
488 * worth worrying about.
493 /* Check if the IO method requires synchronous execution of IO */
501 * Handle IO being processed by IO method.
503 * Should be called by IO methods / synchronous IO execution, just before the
515 * Handle IO getting completed by a method.
517 * Should be called by IO methods / synchronous IO execution, just after the
518 * IO has been performed.
520 * Expects to be called in a critical section. We expect IOs to be usable for
521 * WAL etc, which requires being able to execute completion callbacks in a
541 /* condition variable broadcast ensures state is visible before wakeup */
544 /* contains call to pgaio_io_call_complete_local() */
550 * Has the IO completed and thus the IO handle been reused?
552 * This is useful when waiting for IO completion at a low level (e.g. in an IO
553 * method's ->wait_one() callback).
561 * Ensure that we don't see an earlier state of the handle than ioh->state
562 * due to compiler or CPU reordering. This protects both ->generation as
563 * directly used here, and other fields in the handle accessed in the
564 * caller if the handle was not reused.
572 * Wait for IO to complete. External code should never use this; outside of
573 * the AIO subsystem, waits are only allowed via pgaio_wref_wait().
593 elog(
PANIC,
"waiting for own IO %d in wrong state: %s",
613 * If we need to wait via the IO method, do so now. Don't
614 * check via the IO method if the issuing backend is executing
615 * the IO synchronously.
624 /* waiting for owner to submit */
627 /* waiting for reaper to complete */
630 /* shouldn't be able to hit this otherwise */
632 /* ensure we're going to get woken up */
650 * Note that no interrupts are processed between
651 * pgaio_io_was_recycled() and this check - that's important
652 * as otherwise an interrupt could have already reclaimed the
663 * Make IO handle ready to be reused after IO has completed or after the
664 * handle has been released without being used.
666 * Note that callers need to be careful about only calling this in the right
667 * state and that no interrupts can be processed between the state check and
668 * the call to pgaio_io_reclaim(). Otherwise interrupt processing could
669 * already have reclaimed the handle.
674 /* This is only ok if it's our IO */
678 /* see comment in function header */
682 * It's a bit ugly, but right now the easiest place to put the execution
683 * of local completion callbacks is this function, as we need to execute
684 * local callbacks just before reclaiming at multiple callsites.
701 "reclaiming: distilled_result: (status %s, id %u, error_data %d), raw_result: %d",
707 /* if the IO has been defined, it's on the in-flight list, remove */
720 * Update generation & state first, before resetting the IO's fields,
721 * otherwise a concurrent "viewer" could think the fields are valid, even
722 * though they are being reset. Increment the generation first, so that
723 * we can assert elsewhere that we never wait for an IDLE IO. While it's
724 * a bit weird for the state to go backwards for a generation, it's OK
725 * here, as there cannot be references to the "reborn" IO yet. Can't
726 * update both at once, so something has to give.
731 /* ensure the state update is visible before we reset fields */
744 * We push the IO to the head of the idle IO list, that seems more cache
745 * efficient in cases where only a few IOs are used.
753 * Wait for an IO handle to become usable.
755 * This only really is useful for pgaio_io_acquire().
762 pgaio_debug(
DEBUG2,
"waiting for free IO with %d pending, %u in-flight, %u idle IOs",
768 * First check if any of our IOs actually have completed - when using
769 * worker, that'll often be the case. We could do so as part of the loop
770 * below, but that'd potentially lead us to wait for some IO submitted
780 * Note that no interrupts are processed between the state check
781 * and the call to reclaim - that's important as otherwise an
782 * interrupt could have already reclaimed the handle.
784 * Need to ensure that there's no reordering, in the more common
785 * paths, where we wait for IO, that's done by
786 * pgaio_io_was_recycled().
798 * If we have any unsubmitted IOs, submit them now. We'll start waiting in
799 * a second, so it's better they're in flight. This also addresses the
800 * edge-case that all IOs are unsubmitted.
805 /* possibly some IOs finished during submission */
818 * Wait for the oldest in-flight IO to complete.
820 * XXX: Reusing the general IO wait is suboptimal, we don't need to wait
821 * for that specific IO to complete, we just need *any* IO to complete.
830 /* should not be in in-flight list */
836 elog(
ERROR,
"shouldn't get here with io:%d in state %d",
843 "waiting for free io with %u in flight",
847 * In a more general case this would be racy, because the
848 * generation could increase after we read ioh->state above.
849 * But we are only looking at IOs by the current backend and
850 * the IO can only be recycled by this backend. Even this is
851 * only OK because we get the handle's generation before
852 * potentially processing interrupts, e.g. as part of
861 * It's possible that another backend just finished this IO.
863 * Note that no interrupts are processed between the state
864 * check and the call to reclaim - that's important as
865 * otherwise an interrupt could have already reclaimed the
868 * Need to ensure that there's no reordering, in the more
869 * common paths, where we wait for IO, that's done by
870 * pgaio_io_was_recycled().
878 elog(
PANIC,
"no idle IO after waiting for IO to terminate");
884 * Internal - code outside of AIO should never need this and it'd be hard for
885 * such code to be safe.
899 Assert(*ref_generation != 0);
907#define PGAIO_HS_TOSTR_CASE(sym) case PGAIO_HS_##sym: return #sym
919#undef PGAIO_HS_TOSTR_CASE
921 return NULL;
/* silence compiler */
947 return NULL;
/* silence compiler */
952/* --------------------------------------------------------------------------------
953 * Functions primarily related to IO Wait References
954 * --------------------------------------------------------------------------------
958 * Mark a wait reference as invalid
966/* Is the wait reference valid? */
974 * Similar to pgaio_io_get_id(), just for wait references.
984 * Wait for the IO to have completed. Can be called in any process, not just
985 * in the issuing backend.
999 * Check if the referenced IO completed, without blocking.
1023 * Note that no interrupts are processed between
1024 * pgaio_io_was_recycled() and this check - that's important as
1025 * otherwise an interrupt could have already reclaimed the handle.
1033 * XXX: It likely would be worth checking in with the io method, to give
1034 * the IO method a chance to check if there are completion events queued.
1042/* --------------------------------------------------------------------------------
1043 * Actions on multiple IOs.
1044 * --------------------------------------------------------------------------------
1048 * Submit IOs in batches going forward.
1050 * Submitting multiple IOs at once can be substantially faster than doing so
1051 * one-by-one. At the same time, submitting multiple IOs at once requires more
1052 * care to avoid deadlocks.
1054 * Consider backend A staging an IO for buffer 1 and then trying to start IO
1055 * on buffer 2, while backend B does the inverse. If A submitted the IO before
1056 * moving on to buffer 2, this works just fine, B will wait for the IO to
1057 * complete. But if batching were used, each backend will wait for IO that has
1058 * not yet been submitted to complete, i.e. forever.
1060 * End batch submission mode with pgaio_exit_batchmode(). (Throwing errors is
1061 * allowed; error recovery will end the batch.)
1063 * To avoid deadlocks, code needs to ensure that it will not wait for another
1064 * backend while there is unsubmitted IO. E.g. by using conditional lock
1065 * acquisition when acquiring buffer locks. To check if there currently are
1066 * staged IOs, call pgaio_have_staged() and to submit all staged IOs call
1067 * pgaio_submit_staged().
1069 * It is not allowed to enter batchmode while already in batchmode; it's
1070 * unlikely to ever be needed, as code needs to be explicitly aware of being
1071 * called in batchmode, to avoid the deadlock risks explained above.
1073 * Note that IOs may get submitted before pgaio_exit_batchmode() is called,
1074 * e.g. because too many IOs have been staged or because pgaio_submit_staged()
1081 elog(
ERROR,
"starting batch while batch already in progress");
1086 * Stop submitting IOs in batches.
1098 * Are there staged but unsubmitted IOs?
1100 * See comment above pgaio_enter_batchmode() for why code may need to check if
1101 * there is IO in that state.
1112 * Submit all staged but not yet submitted IOs.
1114 * Unless in batch mode, this never needs to be called, as IOs get submitted
1115 * as soon as possible. While in batchmode pgaio_submit_staged() can be called
1116 * before waiting on another backend, to avoid the risk of deadlocks. See
1117 * pgaio_enter_batchmode().
1122 int total_submitted = 0;
1136 total_submitted += did_submit;
1138 Assert(total_submitted == did_submit);
1143 "aio: submitted %d IOs",
1149/* --------------------------------------------------------------------------------
1151 * --------------------------------------------------------------------------------
1156 * Perform AIO related cleanup after an error.
1158 * This should be called early in the error recovery paths, as later steps may
1159 * need to issue AIO (e.g. to record a transaction abort WAL record).
1165 * It is possible that code errored out after pgaio_enter_batchmode() but
1166 * before pgaio_exit_batchmode() was called. In that case we need to
1167 * submit the IO now.
1177 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1183 * Perform AIO related checks at (sub-)transactional boundaries.
1185 * This should be called late during (sub-)transactional commit/abort, after
1186 * all steps that might need to perform AIO, so that we can verify that the
1187 * AIO subsystem is in a valid state at the end of a transaction.
1193 * We should never be in batch mode at transactional boundaries. In case
1194 * an error was thrown while in batch mode, pgaio_error_cleanup() should
1195 * have exited batchmode.
1197 * In case we are in batchmode somehow, make sure to submit all staged
1198 * IOs, other backends may need them to complete to continue.
1203 elog(
WARNING,
"open AIO batch at end of (sub-)transaction");
1207 * As we aren't in batchmode, there shouldn't be any unsubmitted IOs.
1213 * Need to submit staged but not yet submitted IOs using the fd, otherwise
1214 * the IO would end up targeting something bogus.
1220 * Might be called before AIO is initialized or in a subprocess that
1227 * For now just submit all staged IOs - we could be more selective, but
1228 * it's probably not worth it.
1233 "submitting %d IOs before FD %d gets closed",
1239 * If requested by the IO method, wait for all IOs that use the
1245 * As waiting for one IO to complete may complete multiple IOs, we
1246 * can't just use a mutable list iterator. The maximum number of
1247 * in-flight IOs is fairly small, so just restart the loop after
1248 * waiting for an IO.
1272 "waiting for IO before FD %d gets closed, %u in-flight IOs",
1275 /* see comment in pgaio_io_wait_for_free() about raciness */
1282 * Registered as before_shmem_exit() callback in pgaio_init_backend()
1290 /* first clean up resources as we would at a transaction boundary */
1294 * Before exiting, make sure that all IOs are finished. That has two main
1297 * - Some kernel-level AIO mechanisms don't deal well with the issuer of
1298 * an AIO exiting before IO completed
1300 * - It'd be confusing to see partially finished IOs in stats views etc
1308 "waiting for IO to complete during shutdown, %u in-flight IOs",
1311 /* see comment in pgaio_io_wait_for_free() about raciness */
1333 * Auto-tuning will be applied later during startup, as auto-tuning
1334 * depends on the value of various GUCs.
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
bool pgaio_wref_valid(PgAioWaitRef *iow)
int pgaio_io_get_id(PgAioHandle *ioh)
PgAioBackend * pgaio_my_backend
const char * pgaio_result_status_string(PgAioResultStatus rs)
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
void assign_io_method(int newval, void *extra)
static void pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
void pgaio_wref_clear(PgAioWaitRef *iow)
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
static void pgaio_io_wait_for_free(void)
#define PGAIO_HS_TOSTR_CASE(sym)
static const char * pgaio_io_state_get_name(PgAioHandleState s)
void pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
static void pgaio_io_resowner_register(PgAioHandle *ioh)
static PgAioHandle * pgaio_io_from_wref(PgAioWaitRef *iow, uint64 *ref_generation)
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
void pgaio_closing_fd(int fd)
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
bool pgaio_have_staged(void)
const IoMethodOps * pgaio_method_ops
bool pgaio_wref_check_done(PgAioWaitRef *iow)
static const IoMethodOps *const pgaio_method_ops_table[]
static void pgaio_io_reclaim(PgAioHandle *ioh)
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
void pgaio_enter_batchmode(void)
void pgaio_submit_staged(void)
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
const struct config_enum_entry io_method_options[]
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
void pgaio_io_prepare_submit(PgAioHandle *ioh)
void pgaio_wref_wait(PgAioWaitRef *iow)
void pgaio_error_cleanup(void)
void pgaio_io_release(PgAioHandle *ioh)
int pgaio_wref_get_id(PgAioWaitRef *iow)
void AtEOXact_Aio(bool is_commit)
void pgaio_shutdown(int code, Datum arg)
bool check_io_max_concurrency(int *newval, void **extra, GucSource source)
static void pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
void pgaio_exit_batchmode(void)
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
#define DEFAULT_IO_METHOD
void pgaio_io_call_stage(PgAioHandle *ioh)
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
@ PGAIO_HS_COMPLETED_SHARED
@ PGAIO_HS_COMPLETED_LOCAL
#define pgaio_debug(elevel, msg,...)
#define pgaio_debug_io(elevel, ioh, msg,...)
#define PGAIO_SUBMIT_BATCH_SIZE
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
bool pgaio_io_has_target(PgAioHandle *ioh)
#define pg_read_barrier()
#define pg_write_barrier()
bool ConditionVariableCancelSleep(void)
void ConditionVariableBroadcast(ConditionVariable *cv)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
int errmsg_internal(const char *fmt,...)
int errdetail_internal(const char *fmt,...)
#define ereport(elevel,...)
volatile uint32 CritSectionCount
#define GUC_check_errdetail
Assert(PointerIsAligned(start, uint64))
#define dclist_container(type, membername, ptr)
#define dclist_head_element(type, membername, lhead)
static void dclist_push_tail(dclist_head *head, dlist_node *node)
static uint32 dclist_count(const dclist_head *head)
static bool dclist_is_empty(const dclist_head *head)
static void dclist_delete_from(dclist_head *head, dlist_node *node)
static dlist_node * dclist_pop_head_node(dclist_head *head)
static void dclist_push_head(dclist_head *head, dlist_node *node)
#define dlist_container(type, membername, ptr)
#define dclist_foreach(iter, lhead)
#define INJECTION_POINT(name, arg)
const IoMethodOps pgaio_sync_ops
const IoMethodOps pgaio_worker_ops
#define RESUME_INTERRUPTS()
#define INTERRUPTS_CAN_BE_PROCESSED()
#define START_CRIT_SECTION()
#define HOLD_INTERRUPTS()
#define END_CRIT_SECTION()
static rewind_source * source
static int fd(const char *x, int i)
ResourceOwner CurrentResourceOwner
void ResourceOwnerRememberAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
void ResourceOwnerForgetAioHandle(ResourceOwner owner, struct dlist_node *ioh_node)
bool wait_on_fd_before_close
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
dclist_head in_flight_ios
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
PgAioHandle * handed_out_io
PgAioTargetData target_data
struct ResourceOwnerData * resowner
PgAioResult distilled_result
PgAioReturn * report_return
PgAioTargetData target_data