/*-------------------------------------------------------------------------
 *
 * read_stream.c
 *	  Mechanism for accessing buffered relation data with look-ahead
 *
 * Code that needs to access relation data typically pins blocks one at a
 * time, often in a predictable order that might be sequential or data-driven.
 * Calling the simple ReadBuffer() function for each block is inefficient,
 * because blocks that are not yet in the buffer pool require I/O operations
 * that are small and might stall waiting for storage. This mechanism looks
 * into the future and calls StartReadBuffers() and WaitReadBuffers() to read
 * neighboring blocks together and ahead of time, with an adaptive look-ahead
 * distance.
 *
 * A user-provided callback generates a stream of block numbers that is used
 * to form reads of up to io_combine_limit, by attempting to merge them with a
 * pending read. When that isn't possible, the existing pending read is sent
 * to StartReadBuffers() so that a new one can begin to form.
 *
 * The algorithm for controlling the look-ahead distance is based on recent
 * cache hit and miss history. When no I/O is necessary, there is no benefit
 * in looking ahead more than one block. This is the default initial
 * assumption, but when blocks needing I/O are streamed, the distance is
 * increased rapidly to try to benefit from I/O combining and concurrency. It
 * is reduced gradually when cached blocks are streamed.
 *
 * The main data structure is a circular queue of buffers of size
 * max_pinned_buffers plus some extra space for technical reasons, ready to be
 * returned by read_stream_next_buffer(). Each buffer also has an optional
 * variable sized object that is passed from the callback to the consumer of
 * the stream.
 *
 * Parallel to the queue of buffers, there is a circular queue of in-progress
 * I/Os that have been started with StartReadBuffers(), and for which
 * WaitReadBuffers() must be called before returning the buffer.
 *
 * For example, if the callback returns block numbers 10, 42, 43, 44, 60 in
 * successive calls, then these data structures might appear as follows:
 *
 *                          buffers buf/data       ios
 *
 *                          +----+  +-----+       +--------+
 *                          |    |  |     |  +----+ 42..44 | <- oldest_io_index
 *                          +----+  +-----+  |    +--------+
 *   oldest_buffer_index -> | 10 |  |  ?  |  | +--+ 60..60 |
 *                          +----+  +-----+  | |  +--------+
 *                          | 42 |  |  ?  |<-+ |  |        | <- next_io_index
 *                          +----+  +-----+    |  +--------+
 *                          | 43 |  |  ?  |    |  |        |
 *                          +----+  +-----+    |  +--------+
 *                          | 44 |  |  ?  |    |  |        |
 *                          +----+  +-----+    |  +--------+
 *                          | 60 |  |  ?  |<---+
 *                          +----+  +-----+
 *     next_buffer_index -> |    |  |     |
 *                          +----+  +-----+
 *
 * In the example, 5 buffers are pinned, and the next buffer to be streamed to
 * the client is block 10. Block 10 was a hit and has no associated I/O, but
 * the range 42..44 requires an I/O wait before its buffers are returned, as
 * does block 60.
 *
 * Portions Copyright (c) 2024-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/storage/aio/read_stream.c
 *
 *-------------------------------------------------------------------------
 */
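/*
 * Usage sketch (illustrative, not part of this file): stream every block of
 * one fork of a relation using the general-purpose block range callback from
 * read_stream.h, consuming the buffers in order. Locking and error handling
 * are omitted; the function name is hypothetical.
 */
#ifdef NOT_USED					/* documentation example only */
static void
example_scan_whole_fork(Relation rel)
{
    BlockRangeReadStreamPrivate p;
    ReadStream *stream;
    Buffer      buf;

    p.current_blocknum = 0;
    p.last_exclusive = RelationGetNumberOfBlocks(rel);

    stream = read_stream_begin_relation(READ_STREAM_FULL,
                                        NULL,	/* no buffer access strategy */
                                        rel,
                                        MAIN_FORKNUM,
                                        block_range_read_stream_cb,
                                        &p,
                                        0);		/* no per-buffer data */

    while ((buf = read_stream_next_buffer(stream, NULL)) != InvalidBuffer)
    {
        /* ... examine the page held in buf ... */
        ReleaseBuffer(buf);
    }

    read_stream_end(stream);
}
#endif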
/*
 * State for managing a stream of reads.
 */
    /*
     * One-block buffer to support 'ungetting' a block number, to resolve flow
     * control problems when I/Os are split.
     */

    /*
     * The callback that will tell us which block numbers to read, and an
     * opaque pointer that will be passed to it for its own purposes.
     */

    /* Next expected block, for detecting sequential access. */

    /* The read operation we are currently preparing. */

    /* Space for buffers and optional per-buffer private data. */

    /* Read operations that have been started but not waited for yet. */

    /* Circular queue of buffers. */
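/*
 * Abridged sketch of the state managed here (hypothetical names with a
 * "Sketch" suffix to avoid clashing with the real definitions; the real
 * structs have more fields, but these follow the comments above and the
 * uses elsewhere in this file):
 */
typedef struct InProgressIOSketch
{
    int16       buffer_index;       /* first buffer of the started read */
    ReadBuffersOperation op;        /* handed to WaitReadBuffers() later */
} InProgressIOSketch;

typedef struct ReadStreamSketch
{
    int16       max_ios;            /* I/O concurrency budget */
    int16       distance;           /* current look-ahead distance */
    int16       max_pinned_buffers; /* pin budget for this stream */
    int16       pinned_buffers;     /* pins currently held by the stream */
    int16       io_combine_limit;   /* captured GUC value */
    bool        advice_enabled;     /* issue read-ahead advice? */
    bool        batch_mode;         /* callback opted into batched AIO */
    bool        temporary;          /* scanning a temporary relation? */

    BlockNumber buffered_blocknum;  /* one-block 'unget' buffer */
    ReadStreamBlockNumberCB callback;   /* supplies block numbers */
    void       *callback_private_data;

    BlockNumber seq_blocknum;       /* next expected block number */
    BlockNumber seq_until_processed;    /* advice bookkeeping */

    BlockNumber pending_read_blocknum;  /* the read we are forming */
    int16       pending_read_nblocks;
    int16       forwarded_buffers;  /* pins left over from a split read */

    size_t      per_buffer_data_size;   /* optional per-buffer payload */
    void       *per_buffer_data;

    InProgressIOSketch *ios;        /* circular queue of started I/Os */
    int16       oldest_io_index;
    int16       next_io_index;

    int16       oldest_buffer_index;    /* circular queue of buffers */
    int16       next_buffer_index;
    Buffer      buffers[FLEXIBLE_ARRAY_MEMBER];
} ReadStreamSketch;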
/*
 * Return a pointer to the per-buffer data by index.
 */
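/*
 * Sketch of that accessor, assuming a base pointer and size field as in the
 * struct comments above:
 */
static inline void *
get_per_buffer_data(ReadStream *stream, int16 buffer_index)
{
    return (char *) stream->per_buffer_data +
        (size_t) stream->per_buffer_data_size * buffer_index;
}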
/*
 * General-use ReadStreamBlockNumberCB for block range scans. Loops over the
 * blocks [current_blocknum, last_exclusive).
 */
BlockNumber
block_range_read_stream_cb(ReadStream *stream,
                           void *callback_private_data,
                           void *per_buffer_data)
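{
    /*
     * Body sketched from the comment above: hand out ascending block numbers
     * until last_exclusive is reached, then signal end-of-stream.
     */
    BlockRangeReadStreamPrivate *p = callback_private_data;

    if (p->current_blocknum < p->last_exclusive)
        return p->current_blocknum++;

    return InvalidBlockNumber;
}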
/*
 * Ask the callback which block it would like us to read next, with a
 * one-block buffer in front to allow read_stream_unget_block() to work.
 */

    /*
     * Tell Valgrind that the per-buffer data is undefined. That replaces
     * the "noaccess" state that was set when the consumer moved past this
     * entry last time around the queue, and should also catch callbacks
     * that fail to initialize data that the buffer consumer later
     * accesses. On the first go around, it is undefined already.
     */
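/*
 * Sketch of the flow those comments describe (assuming the one-block 'unget'
 * field from the struct comments): return a previously ungot block if there
 * is one, otherwise mark the per-buffer data undefined for Valgrind and
 * consult the callback.
 */
static BlockNumber
read_stream_get_block(ReadStream *stream, void *per_buffer_data)
{
    BlockNumber blocknum;

    blocknum = stream->buffered_blocknum;
    if (blocknum != InvalidBlockNumber)
        stream->buffered_blocknum = InvalidBlockNumber;
    else
    {
        if (per_buffer_data)
            VALGRIND_MAKE_MEM_UNDEFINED(per_buffer_data,
                                        stream->per_buffer_data_size);
        blocknum = stream->callback(stream,
                                    stream->callback_private_data,
                                    per_buffer_data);
    }

    return blocknum;
}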
/*
 * In order to deal with buffer shortages and I/O limits after short reads, we
 * sometimes need to defer handling of a block we've already consumed from the
 * registered callback until later.
 */

    /* We shouldn't ever unget more than one block. */
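/*
 * Sketch of the mechanism: stash the block in the one-entry buffer so that
 * the next read_stream_get_block() call returns it first.
 */
static void
read_stream_unget_block(ReadStream *stream, BlockNumber blocknum)
{
    Assert(blocknum != InvalidBlockNumber);
    Assert(stream->buffered_blocknum == InvalidBlockNumber);
    stream->buffered_blocknum = blocknum;
}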
/*
 * Start as much of the current pending read as we can. If we have to split it
 * because of the per-backend buffer limit, or the buffer manager decides to
 * split it, then the pending read is adjusted to hold the remaining portion.
 *
 * We can always start a read of at least size one if we have no progress yet.
 * Otherwise it's possible that we can't start a read at all because of a lack
 * of buffers, and then false is returned. Buffer shortages also reduce the
 * distance to a level that prevents look-ahead until buffers are released.
 */
    int         requested_nblocks;

    /* This should only be called with a pending read. */

    /* We had better not exceed the per-stream buffer limit with this read. */

#ifdef USE_ASSERT_CHECKING
    /* We had better not be overwriting an existing pinned buffer. */

    /*
     * Pinned buffers forwarded by a preceding StartReadBuffers() call that
     * had to split the operation should match the leading blocks of this
     * following StartReadBuffers() call.
     */

    /*
     * Check that we've cleared the queue/overflow entries corresponding to
     * the rest of the blocks covered by this read, unless it's the first go
     * around and we haven't even initialized them yet.
     */
    /* Do we need to issue read-ahead advice? */

        /*
         * Sequential: Issue advice until the preadv() calls have caught
         * up with the first advice issued for this sequential region, and
         * then stay out of the way of the kernel's own read-ahead.
         */

        /*
         * Random jump: Note the starting location of a new potential
         * sequential region and start issuing advice. Skip it this time
         * if the preadv() follows immediately, e.g. first block in stream.
         */
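    /*
     * A sketch of the decision those two comments describe, assuming the
     * advice_enabled, seq_blocknum and seq_until_processed fields from the
     * struct sketch and a local "flags" later passed to StartReadBuffers():
     */
    if (stream->advice_enabled)
    {
        if (stream->pending_read_blocknum == stream->seq_blocknum)
        {
            /* Sequential region: advise only until the kernel catches up. */
            if (stream->seq_until_processed != InvalidBlockNumber)
                flags |= READ_BUFFERS_ISSUE_ADVICE;
        }
        else
        {
            /* Random jump: start a new region; skip advice if we'd read now. */
            if (stream->pinned_buffers > 0)
                flags |= READ_BUFFERS_ISSUE_ADVICE;
            stream->seq_until_processed = stream->pending_read_blocknum;
        }
    }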
    /*
     * How many more buffers is this backend allowed to pin?
     *
     * Forwarded buffers are already pinned and map to the leading blocks of
     * the pending read (the remaining portion of an earlier short read that
     * we're about to continue). They are not counted in pinned_buffers, but
     * they are counted as pins already held by this backend according to the
     * buffer manager, so they must be added to the limit it grants us.
     */
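    /*
     * Sketch of that computation, assuming the "temporary" and
     * "forwarded_buffers" fields from the struct sketch and a local
     * buffer_limit:
     */
    buffer_limit = Min(stream->temporary ?
                       GetAdditionalLocalPinLimit() :
                       GetAdditionalPinLimit(),
                       PG_INT16_MAX);
    buffer_limit = Min(buffer_limit + stream->forwarded_buffers, PG_INT16_MAX);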
    if (buffer_limit == 0 && stream->pinned_buffers == 0)
        buffer_limit = 1;       /* guarantee progress */
    /* Does the per-backend limit affect this read? */
    if (buffer_limit < nblocks)

        /* Shrink distance: no more look-ahead until buffers are released. */
        if (stream->distance > new_distance)

        /* Unless we have nothing to give the consumer, stop here. */

        /* A short read is required to make progress. */
        nblocks = buffer_limit;
    /*
     * We say how many blocks we want to read, but it may be smaller on return
     * if the buffer manager decides to shorten the read. Initialize buffers
     * to InvalidBuffer (= not a forwarded buffer) as input on first use only,
     * and keep the original nblocks number so we can check for forwarded
     * buffers as output, below.
     */
    requested_nblocks = nblocks;
    need_wait = StartReadBuffers(&stream->ios[io_index].op,
                                 &stream->buffers[buffer_index],
                                 blocknum,
                                 &nblocks,
                                 flags);
    /* Remember whether we need to wait before returning this buffer. */

        /* Look-ahead distance decays, no I/O necessary. */

        /*
         * Remember to call WaitReadBuffers() before returning head buffer.
         * Look-ahead distance will be adjusted after waiting.
         */
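    /* A sketch of the two branches those comments describe: */
    if (!need_wait)
    {
        if (stream->distance > 1)
            stream->distance--;
    }
    else
    {
        stream->ios[io_index].buffer_index = buffer_index;
        if (++stream->next_io_index == stream->max_ios)
            stream->next_io_index = 0;
    }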
    /*
     * How many pins were acquired but forwarded to the next call? These need
     * to be passed to the next StartReadBuffers() call by leaving them
     * exactly where they are in the queue, or released if the stream ends
     * early. We need the number for accounting purposes, since they are not
     * counted in stream->pinned_buffers but we already hold them.
     */
    forwarded = 0;
    while (nblocks + forwarded < requested_nblocks &&
           stream->buffers[buffer_index + nblocks + forwarded] != InvalidBuffer)
        forwarded++;
    stream->forwarded_buffers = forwarded;
    /*
     * We gave a contiguous range of buffer space to StartReadBuffers(), but
     * we want it to wrap around at queue_size. Copy overflowing buffers to
     * the front of the array where they'll be consumed, but also leave a copy
     * in the overflow zone which the I/O operation has a pointer to (it needs
     * a contiguous array). Both copies will be cleared when the buffers are
     * handed to the consumer.
     */
    overflow = (buffer_index + nblocks + forwarded) - stream->queue_size;
    if (overflow > 0)
    {
        Assert(overflow < stream->queue_size);  /* can't overlap */
        memcpy(&stream->buffers[0],
               &stream->buffers[stream->queue_size],
               sizeof(stream->buffers[0]) * overflow);
    }
    /* Compute location of start of next read, without using % operator. */
    buffer_index += nblocks;
    if (buffer_index >= stream->queue_size)
        buffer_index -= stream->queue_size;
    Assert(buffer_index >= 0 && buffer_index < stream->queue_size);
    /* Adjust the pending read to cover the remaining portion, if any. */
    /*
     * Allow amortizing the cost of submitting IO over multiple IOs. This
     * requires that we don't do any operations that could lead to a deadlock
     * with staged-but-unsubmitted IO. The callback needs to opt in to being
     * called in this mode by specifying READ_STREAM_USE_BATCHING.
     */
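    /*
     * Sketch, assuming a boolean batch_mode field recorded from that flag at
     * stream creation; the matching pgaio_exit_batchmode() call would come
     * before returning from this function.
     */
    if (stream->batch_mode)
        pgaio_enter_batchmode();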
    void       *per_buffer_data;

        /*
         * See which block the callback wants next in the stream. We need to
         * compute the index of the Nth block of the pending read including
         * wrap-around, but we don't want to use the expensive % operator.
         */
        Assert(buffer_index >= 0 && buffer_index < stream->queue_size);
        /* Can we merge it with the pending read? */

        /* We have to start the pending read before we can build another. */

        /* We've hit the buffer or I/O limit. Rewind and stop here. */

        /* This is the start of a new pending read. */
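        /*
         * Sketch of the decision those comments describe (the real loop has
         * more bookkeeping; "blocknum" is the block just obtained from the
         * callback):
         */
        if (stream->pending_read_nblocks > 0 &&
            stream->pending_read_blocknum + stream->pending_read_nblocks == blocknum &&
            stream->pending_read_nblocks < stream->io_combine_limit)
        {
            stream->pending_read_nblocks++;     /* merged */
        }
        else
        {
            if (stream->pending_read_nblocks > 0 &&
                !read_stream_start_pending_read(stream))
            {
                read_stream_unget_block(stream, blocknum);
                break;
            }
            stream->pending_read_blocknum = blocknum;
            stream->pending_read_nblocks = 1;
        }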
        /*
         * We don't start the pending read just because we've hit the distance
         * limit, preferring to give it another chance to grow to full
         * io_combine_limit size once more buffers have been consumed. However,
         * if we've already reached io_combine_limit, or we've reached the
         * distance limit and there isn't anything pinned yet, or the callback
         * has signaled end-of-stream, we start the read immediately. Note that
         * the pending read can exceed the distance goal, if the latter was
         * reduced after hitting the per-backend buffer limit.
         */

    /*
     * There should always be something pinned when we leave this function,
     * whether started by this call or not, unless we've hit the end of the
     * stream. In the worst case we can always make progress one buffer at a
     * time.
     */
/*
 * Create a new read stream object that can be used to perform the equivalent
 * of a series of ReadBuffer() calls for one fork of one relation.
 * Internally, it generates larger vectored reads where possible by looking
 * ahead. The callback should return block numbers or InvalidBlockNumber to
 * signal end-of-stream, and if per_buffer_data_size is non-zero, it may also
 * write extra data for each block into the space provided to it. It will
 * also receive callback_private_data for its own purposes.
 */
static ReadStream *
read_stream_begin_impl(int flags, BufferAccessStrategy strategy,
                       Relation rel, SMgrRelation smgr, char persistence,
                       ForkNumber forknum, ReadStreamBlockNumberCB callback,
                       void *callback_private_data,
                       size_t per_buffer_data_size)
{
    int16       queue_overflow;
    int         strategy_pin_limit;
    uint32      max_pinned_buffers;
    uint32      max_possible_buffer_limit;
    /*
     * Decide how many I/Os we will allow to run at the same time. That
     * currently means advice to the kernel to tell it that we will soon read.
     * This number also affects how far we look ahead for opportunities to
     * start reads.
     */

        /*
         * Avoid circularity while trying to look up tablespace settings or
         * before spccache.c is ready.
         */

    /* Cap to INT16_MAX to avoid overflowing below */
    /*
     * If starting a multi-block I/O near the end of the queue, we might
     * temporarily need extra space for overflowing buffers before they are
     * moved to regular circular position. This is the maximum extra space we
     * could need.
     */

    /*
     * Choose the maximum number of buffers we're prepared to pin. We try to
     * pin fewer if we can, though. We add one so that we can make progress
     * even if max_ios is set to 0 (see also further down). For max_ios > 0,
     * this also allows an extra full I/O's worth of buffers: after an I/O
     * finishes we don't want to have to wait for its buffers to be consumed
     * before starting a new one.
     *
     * Be careful not to allow int16 to overflow. That is possible with the
     * current GUC range limits, so this is an artificial limit of ~32k
     * buffers and we'd need to adjust the types to exceed that. We also have
     * to allow for the spare entry and the overflow space.
     */
    max_pinned_buffers = Min(max_pinned_buffers,
                             PG_INT16_MAX - io_combine_limit - 1);

    /* Give the strategy a chance to limit the number of buffers we pin. */
    strategy_pin_limit = GetAccessStrategyPinLimit(strategy);
    max_pinned_buffers = Min(strategy_pin_limit, max_pinned_buffers);

    /*
     * Also limit our queue to the maximum number of pins we could ever be
     * allowed to acquire according to the buffer manager. We may not really
     * be able to use them all due to other pins held by this backend, but
     * we'll check that later in read_stream_start_pending_read().
     */
    max_pinned_buffers = Min(max_pinned_buffers, max_possible_buffer_limit);

    /*
     * The limit might be zero on a system configured with too few buffers for
     * the number of connections. We need at least one to make progress.
     */
    max_pinned_buffers = Max(1, max_pinned_buffers);
    /*
     * We need one extra entry for buffers and per-buffer data, because users
     * of per-buffer data have access to the object until the next call to
     * read_stream_next_buffer(), so we need a gap between the head and tail
     * of the queue so that we don't clobber it.
     */
    queue_size = max_pinned_buffers + 1;
    /*
     * Allocate the object, the buffers, the ios and per_buffer_data space in
     * one big chunk. Though we have queue_size buffers, we want to be able
     * to assume that all the buffers for a single read are contiguous (i.e.
     * don't wrap around halfway through), so we allow temporary overflows of
     * up to the maximum possible overflow size.
     */
    size = offsetof(ReadStream, buffers);
    size += sizeof(Buffer) * (queue_size + queue_overflow);
    size += per_buffer_data_size * queue_size;
    size += MAXIMUM_ALIGNOF * 2;
    stream = (ReadStream *) palloc(size);
    memset(stream, 0, offsetof(ReadStream, buffers));

    if (per_buffer_data_size > 0)
        stream->per_buffer_data = (void *)
            MAXALIGN(&stream->buffers[queue_size + queue_overflow]);
    /*
     * Read-ahead advice simulating asynchronous I/O with synchronous calls.
     * Issue advice only if AIO is not used, direct I/O isn't enabled, the
     * caller hasn't promised sequential access (overriding our detection
     * heuristics), and max_ios hasn't been set to zero.
     */

    /*
     * Setting max_ios to zero disables AIO and advice-based pseudo AIO, but
     * we still need to allocate space to combine and run one I/O. Bump it up
     * to one, and remember to ask for synchronous I/O only.
     */
    /*
     * Capture stable values for these two GUC-derived numbers for the
     * lifetime of this stream, so we don't have to worry about the GUCs
     * changing underneath us beyond this point.
     */

    /*
     * Skip the initial ramp-up phase if the caller says we're going to be
     * reading the whole relation. This way we start out assuming we'll be
     * doing full io_combine_limit sized reads.
     */
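    /* Sketch of that choice, assuming the READ_STREAM_FULL flag is used: */
    if (flags & READ_STREAM_FULL)
        stream->distance = Min(max_pinned_buffers, stream->io_combine_limit);
    else
        stream->distance = 1;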
    /*
     * Since we always access the same relation, we can initialize parts of
     * the ReadBuffersOperation objects and leave them that way, to avoid
     * wasting CPU cycles writing to them for each read.
     */
    for (int i = 0; i < max_ios; ++i)
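    {
        /*
         * Sketch of the per-I/O initialization described above, assuming
         * these ReadBuffersOperation members:
         */
        stream->ios[i].op.rel = rel;
        stream->ios[i].op.smgr = smgr;
        stream->ios[i].op.persistence = persistence;
        stream->ios[i].op.forknum = forknum;
        stream->ios[i].op.strategy = strategy;
    }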
/*
 * Create a new read stream for reading a relation.
 * See read_stream_begin_impl() for the detailed explanation.
 */
ReadStream *
read_stream_begin_relation(int flags, BufferAccessStrategy strategy,
                           Relation rel, ForkNumber forknum,
                           ReadStreamBlockNumberCB callback,
                           void *callback_private_data,
                           size_t per_buffer_data_size)
{
    return read_stream_begin_impl(flags, strategy, rel, RelationGetSmgr(rel),
                                  rel->rd_rel->relpersistence, forknum,
                                  callback, callback_private_data,
                                  per_buffer_data_size);
}
/*
 * Create a new read stream for reading an SMgr relation.
 * See read_stream_begin_impl() for the detailed explanation.
 */
ReadStream *
read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy,
                                SMgrRelation smgr, char smgr_persistence,
                                ForkNumber forknum, ReadStreamBlockNumberCB callback,
                                void *callback_private_data, size_t per_buffer_data_size)
{
    return read_stream_begin_impl(flags, strategy, NULL, smgr, smgr_persistence,
                                  forknum, callback, callback_private_data,
                                  per_buffer_data_size);
}
/*
 * Pull one pinned buffer out of a stream. Each call returns successive
 * blocks in the order specified by the callback. If per_buffer_data_size was
 * set to a non-zero size, *per_buffer_data receives a pointer to the extra
 * per-buffer data that the callback had a chance to populate, which remains
 * valid until the next call to read_stream_next_buffer(). When the stream
 * runs out of data, InvalidBuffer is returned. The caller may decide to end
 * the stream early at any time by calling read_stream_end().
 */
    int16       oldest_buffer_index;

#ifndef READ_STREAM_DISABLE_FAST_PATH

    /*
     * A fast path for all-cached scans. This is the same as the usual
     * algorithm, but it is specialized for no I/O and no per-buffer data, so
     * we can skip the queue management code, stay in the same buffer slot and
     * use singular StartReadBuffer().
     */

        /* Fast path assumptions. */

        /* We're going to return the buffer we pinned last time. */
        buffer = stream->buffers[oldest_buffer_index];

        /* Choose the next block to pin. */

        /*
         * Pin a buffer for the next call. Same buffer entry, and
         * arbitrary I/O entry (they're all free). We don't have to
         * adjust pinned_buffers because we're transferring one to caller
         * but pinning one more.
         *
         * In the fast path we don't need to check the pin limit. We're
         * always allowed at least one pin so that progress can be made,
         * and that's all we need here. Although two pins are momentarily
         * held at the same time, the model used here is that the stream
         * holds only one, and the other now belongs to the caller.
         */
            &stream->buffers[oldest_buffer_index],
        /* Next call must wait for I/O for the newly pinned buffer. */

        /* No more blocks, end of stream. */

    /* End of stream reached? */

    /*
     * The usual order of operations is that we look ahead at the bottom
     * of this function after potentially finishing an I/O and making
     * space for more, but if we're just starting up we'll need to crank
     * the handle to get started.
     */

    /* End of stream reached? */

    /* Grab the oldest pinned buffer and associated per-buffer data. */
    Assert(oldest_buffer_index >= 0 &&
           oldest_buffer_index < stream->queue_size);
    buffer = stream->buffers[oldest_buffer_index];
    /* Do we have to wait for an associated I/O first? */

        int32       distance;   /* wider temporary value, clamped below */

        /* Sanity check that we still agree on the buffers. */
               &stream->buffers[oldest_buffer_index]);

        /* Look-ahead distance ramps up rapidly after we do I/O. */
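        /* For example (a sketch): double it, capped at the pin budget. */
        distance = stream->distance * 2;
        distance = Min(distance, stream->max_pinned_buffers);
        stream->distance = distance;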
        /*
         * If we've reached the first block of a sequential region we're
         * issuing advice for, cancel that until the next jump. The kernel
         * will see the sequential preadv() pattern starting here.
         */

    /*
     * We must zap this queue entry, or else it would appear as a forwarded
     * buffer. If it's potentially in the overflow zone (i.e. from a
     * multi-block I/O that wrapped around the queue), also zap the copy.
     */
#if defined(CLOBBER_FREED_MEMORY) || defined(USE_VALGRIND)

    /*
     * The caller will get access to the per-buffer data, until the next call.
     * We wipe the one before, which is never occupied because queue_size
     * allowed one extra element. This will hopefully trip up client code
     * that is holding a dangling pointer to it.
     */
        void       *per_buffer_data;

        per_buffer_data = get_per_buffer_data(stream,
                                              oldest_buffer_index == 0 ?
                                              stream->queue_size - 1 :
                                              oldest_buffer_index - 1);
#if defined(CLOBBER_FREED_MEMORY)
        /* This also tells Valgrind the memory is "noaccess". */
#elif defined(USE_VALGRIND)
        /* Tell it ourselves. */
    /* Pin transferred to caller. */

    /* Advance oldest buffer, with wrap-around. */

    /* Prepare for the next call. */

#ifndef READ_STREAM_DISABLE_FAST_PATH
    /* See if we can take the fast path for all-cached scans next time. */

        /*
         * The fast path spins on one buffer entry repeatedly instead of
         * rotating through the whole queue and clearing the entries behind
         * it. If the buffer it starts with happened to be forwarded between
         * StartReadBuffers() calls and also wrapped around the circular queue
         * partway through, then a copy also exists in the overflow zone, and
         * it won't clear it out as the regular path would. Do that now, so
         * it doesn't need code for that.
         */
/*
 * Transitional support for code that would like to perform or skip reads
 * itself, without using the stream. Returns, and consumes, the next block
 * number that would be read by the stream's look-ahead algorithm, or
 * InvalidBlockNumber if the end of the stream is reached. Also reports the
 * strategy that would be used to read it.
 */

/*
 * Reset a read stream by releasing any queued up buffers, allowing the stream
 * to be used again for different blocks. This can be used to clear an
 * end-of-stream condition and start again, or to throw away blocks that were
 * speculatively read and read some different blocks instead.
 */
    /* Stop looking ahead. */

    /* Forget buffered block number and fast path state. */

    /* Unpin anything that wasn't consumed. */

    /* Unpin any unused forwarded buffers. */

    /* Start off assuming data is cached. */

/*
 * Release and free a read stream.
 */
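/*
 * A sketch of what that amounts to, given read_stream_reset() above: drop
 * any remaining pins, then free the single allocation backing the stream.
 */
void
read_stream_end(ReadStream *stream)
{
    read_stream_reset(stream);
    pfree(stream);
}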