1/*-------------------------------------------------------------------------
4 * AIO related declarations that should only be used by the AIO subsystem
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/include/storage/aio_internal.h
13 *-------------------------------------------------------------------------
26 * The maximum number of IOs that can be batch submitted at once.
28 #define PGAIO_SUBMIT_BATCH_SIZE 32
33 * State machine for handles. With some exceptions, noted below, handles move
34 * linearly through all states.
36 * State changes should all go through pgaio_io_update_state().
38 * Note that the externally visible functions to start IO
39 * (e.g. FileStartReadV(), via pgaio_io_start_readv()) move an IO from
40 * PGAIO_HS_HANDED_OUT to at least PGAIO_HS_STAGED and at most
41 * PGAIO_HS_COMPLETED_LOCAL (at which point the handle will be reused).
49 * Returned by pgaio_io_acquire(). The next state is either DEFINED (if
50 * pgaio_io_start_*() is called), or IDLE (if pgaio_io_release() is
56 * pgaio_io_start_*() has been called, but IO is not yet staged. At this
57 * point the handle has all the information for the IO to be executed.
62 * stage() callbacks have been called, handle ready to be submitted for
 63 * execution. Unless in batchmode (cf. pgaio_enter_batchmode()), the
64 * IO will be submitted immediately after.
68 /* IO has been submitted to the IO method for execution */
71 /* IO finished, but result has not yet been processed */
75 * IO completed, shared completion has been called.
77 * If the IO completion occurs in the issuing backend, local callbacks
78 * will immediately be called. Otherwise the handle stays in
79 * COMPLETED_SHARED until the issuing backend waits for the completion of
85 * IO completed, local completion has been called.
87 * After this the handle will be made reusable and go into IDLE state.
96 * Typedef is in aio_types.h
98 * We don't use the underlying enums for state, target and op to avoid wasting
99 * space. We tried using bitfields, but several compilers generate rather
100 * horrid code for that.
104 /* all state updates should go through pgaio_io_update_state() */
107 /* what are we operating on */
110 /* which IO operation */
113 /* bitfield of PgAioHandleFlags */
118 /* using the proper type here would use more space */
121 /* data forwarded to each callback */
125 * Length of data associated with handle using
126 * pgaio_io_set_handle_data_*().
130 /* XXX: could be optimized out with some pointer math */
133 /* raw result of the IO operation */
151 /* incremented every time the IO handle is reused */
 155 * To wait for the IO to complete, other backends can wait on this CV. Note
156 * that, if in SUBMITTED state, a waiter first needs to check if it needs
157 * to do work via IoMethodOps->wait_one().
161 /* result of shared callback, passed to issuer callback */
165 * Index into PgAioCtl->iovecs and PgAioCtl->handle_data.
167 * At the moment there's no need to differentiate between the two, but
168 * that won't necessarily stay that way.
 173 * If not NULL, this memory location will be updated with information
 174 * about the IO's completion iff the issuing backend learns about the IO's
179 /* Data necessary for the IO to be performed */
183 * Data necessary to identify the object undergoing IO to higher-level
184 * code. Needs to be sufficient to allow another backend to reopen the
193 /* index into PgAioCtl->io_handles */
196 /* IO Handles that currently are not used */
200 * Only one IO may be returned by pgaio_io_acquire()/pgaio_io_acquire_nb()
201 * without having been either defined (by actually associating it with IO)
202 * or released (with pgaio_io_release()). This restriction is necessary to
203 * guarantee that we always can acquire an IO. ->handed_out_io is used to
208 /* Are we currently in batchmode? See pgaio_enter_batchmode(). */
212 * IOs that are defined, but not yet submitted.
218 * List of in-flight IOs. Also contains IOs that aren't strictly speaking
219 * in-flight anymore, but have been waited-for and completed by another
220 * backend. Once this backend sees such an IO it'll be reclaimed.
222 * The list is ordered by submission time, with more recently submitted
223 * IOs being appended at the end.
235 * Array of iovec structs. Each iovec is owned by a specific backend. The
236 * allocation is in PgAioCtl to allow the maximum number of iovecs for
237 * individual IOs to be configurable with PGC_POSTMASTER GUC.
243 * For, e.g., an IO covering multiple buffers in shared / temp buffers, we
244 * need to get Buffer IDs during completion to be able to change the
245 * BufferDesc state accordingly. This space can be used to store e.g.
246 * Buffer IDs. Note that the actual iovec might be shorter than this,
247 * because we combine neighboring pages into one larger iovec entry.
258 * Callbacks used to implement an IO method.
265 * If an FD is about to be closed, do we need to wait for all in-flight
266 * IOs referencing that FD?
271 /* global initialization */
274 * Amount of additional shared memory to reserve for the io_method. Called
275 * just like a normal ipci.c style *Size() function. Optional.
280 * Initialize shared memory. First time is true if AIO's shared memory was
281 * just initialized, false otherwise. Optional.
286 * Per-backend initialization. Optional.
291 /* handling of IOs */
297 * Start executing passed in IOs.
299 * Shall advance state to at least PGAIO_HS_SUBMITTED. (By the time this
300 * returns, other backends might have advanced the state further.)
302 * Will not be called if ->needs_synchronous_execution() returned true.
304 * num_staged_ios is <= PGAIO_SUBMIT_BATCH_SIZE.
306 * Always called in a critical section.
311 * Wait for the IO to complete. Optional.
 313 * On return, state shall be one of
314 * - PGAIO_HS_COMPLETED_IO
315 * - PGAIO_HS_COMPLETED_SHARED
316 * - PGAIO_HS_COMPLETED_LOCAL
318 * The callback must not block if the handle is already in one of those
319 * states, or has been reused (see pgaio_io_was_recycled()). If, on
320 * return, the state is PGAIO_HS_COMPLETED_IO, state will reach
321 * PGAIO_HS_COMPLETED_SHARED without further intervention by the IO
324 * If not provided, it needs to be guaranteed that the IO method calls
325 * pgaio_io_process_completion() without further interaction by the
362 * The AIO subsystem has fairly verbose debug logging support. This can be
363 * enabled/disabled at build time. The reason for this is that
364 * a) the verbosity can make debugging things on higher levels hard
365 * b) even if logging can be skipped due to elevel checks, it still causes a
366 * measurable slowdown
 368 * XXX: This likely should eventually be disabled by default, at least in
371 #define PGAIO_VERBOSE 1
374 * Simple ereport() wrapper that only logs if PGAIO_VERBOSE is defined.
376 * This intentionally still compiles the code, guarded by a constant if (0),
377 * if verbose logging is disabled, to make it less likely that debug logging
378 * is silently broken.
380 * The current definition requires passing at least one argument.
382 #define pgaio_debug(elevel, msg, ...) \
386 errhidestmt(true), errhidecontext(true), \
387 errmsg_internal(msg, \
392 * Simple ereport() wrapper. Note that the definition requires passing at
393 * least one argument.
395 #define pgaio_debug_io(elevel, ioh, msg, ...) \
396 pgaio_debug(elevel, "io %-10d|op %-5s|target %-4s|state %-16s: " msg, \
397 pgaio_io_get_id(ioh), \
398 pgaio_io_get_op_name(ioh), \
399 pgaio_io_get_target_name(ioh), \
400 pgaio_io_get_state_name(ioh), \
403/* Declarations for the tables of function pointers exposed by each IO method. */
406#ifdef IOMETHOD_IO_URING_ENABLED
416#endif /* AIO_INTERNAL_H */
#define PGAIO_HANDLE_MAX_CALLBACKS
void pgaio_io_process_completion(PgAioHandle *ioh, int result)
void pgaio_io_perform_synchronously(PgAioHandle *ioh)
struct IoMethodOps IoMethodOps
const char * pgaio_result_status_string(PgAioResultStatus rs)
void pgaio_io_call_stage(PgAioHandle *ioh)
PGDLLIMPORT const IoMethodOps pgaio_worker_ops
@ PGAIO_HS_COMPLETED_SHARED
@ PGAIO_HS_COMPLETED_LOCAL
bool pgaio_io_needs_synchronous_execution(PgAioHandle *ioh)
const char * pgaio_io_get_op_name(PgAioHandle *ioh)
PgAioResult pgaio_io_call_complete_local(PgAioHandle *ioh)
void pgaio_io_reopen(PgAioHandle *ioh)
bool pgaio_io_uses_fd(PgAioHandle *ioh, int fd)
bool pgaio_io_can_reopen(PgAioHandle *ioh)
void pgaio_io_call_complete_shared(PgAioHandle *ioh)
void pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
PGDLLIMPORT PgAioBackend * pgaio_my_backend
struct PgAioBackend PgAioBackend
int pgaio_io_get_iovec_length(PgAioHandle *ioh, struct iovec **iov)
PGDLLIMPORT PgAioCtl * pgaio_ctl
PGDLLIMPORT const IoMethodOps pgaio_sync_ops
PGDLLIMPORT const IoMethodOps * pgaio_method_ops
const char * pgaio_io_get_target_name(PgAioHandle *ioh)
const char * pgaio_io_get_state_name(PgAioHandle *ioh)
bool pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState *state)
void pgaio_io_prepare_submit(PgAioHandle *ioh)
void pgaio_shutdown(int code, Datum arg)
#define PGAIO_SUBMIT_BATCH_SIZE
static int fd(const char *x, int i)
size_t(* shmem_size)(void)
bool wait_on_fd_before_close
void(* shmem_init)(bool first_time)
void(* init_backend)(void)
int(* submit)(uint16 num_staged_ios, PgAioHandle **staged_ios)
void(* wait_one)(PgAioHandle *ioh, uint64 ref_generation)
bool(* needs_synchronous_execution)(PgAioHandle *ioh)
dclist_head in_flight_ios
PgAioHandle * staged_ios[PGAIO_SUBMIT_BATCH_SIZE]
PgAioHandle * handed_out_io
PgAioBackend * backend_state
PgAioTargetData target_data
struct ResourceOwnerData * resowner
PgAioResult distilled_result
uint8 callbacks[PGAIO_HANDLE_MAX_CALLBACKS]
PgAioReturn * report_return
uint8 callbacks_data[PGAIO_HANDLE_MAX_CALLBACKS]