1/*-------------------------------------------------------------------------
4 * Internal definitions for buffer manager and the buffer replacement
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/include/storage/buf_internals.h
13 *-------------------------------------------------------------------------
15#ifndef BUFMGR_INTERNALS_H
16#define BUFMGR_INTERNALS_H
33 * Buffer state is a single 32-bit variable in which the following data is combined.
36 * - 4 bits usage count
39 * Combining these values allows us to perform some operations without locking
40 * the buffer header, by modifying them together with a CAS loop.
42 * The definition of buffer state components is below.
/*
 * Widths, in bits, of the three fields packed into the 32-bit buffer state
 * word: reference count, usage count and flag bits.  The three widths must
 * add up to exactly 32 (18 + 4 + 10); a static assertion in this header
 * checks that.
 */
44 #define BUF_REFCOUNT_BITS 18
45 #define BUF_USAGECOUNT_BITS 4
46 #define BUF_FLAG_BITS 10
49 "parts of buffer state space need to equal 32");
/*
 * Masks and unit values for the fields of the buffer state word.
 *
 * Layout implied by the definitions below:
 *   - the refcount occupies the low BUF_REFCOUNT_BITS bits, so one reference
 *     is the value 1;
 *   - the usage count sits directly above the refcount, so one usage-count
 *     unit is 1U << BUF_REFCOUNT_BITS and the field is extracted by shifting
 *     right by BUF_USAGECOUNT_SHIFT;
 *   - the flag bits occupy everything above refcount + usage count.
 */
51 #define BUF_REFCOUNT_ONE 1
52 #define BUF_REFCOUNT_MASK ((1U << BUF_REFCOUNT_BITS) - 1)
53 #define BUF_USAGECOUNT_MASK (((1U << BUF_USAGECOUNT_BITS) - 1) << (BUF_REFCOUNT_BITS))
54 #define BUF_USAGECOUNT_ONE (1U << BUF_REFCOUNT_BITS)
55 #define BUF_USAGECOUNT_SHIFT BUF_REFCOUNT_BITS
56 #define BUF_FLAG_MASK (((1U << BUF_FLAG_BITS) - 1) << (BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS))
58/* Get refcount and usagecount from buffer state */
/*
 * BUF_STATE_GET_REFCOUNT masks off everything but the refcount field, which
 * lives in the low bits and therefore needs no shift.
 * BUF_STATE_GET_USAGECOUNT isolates the usage-count field and shifts it down
 * so the result is a plain small integer.
 * Both macros mention their argument exactly once.
 */
59 #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
60 #define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
63 * Flags for buffer descriptors
65 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
66 * entry associated with the buffer's tag.
/*
 * Individual flag bits within the buffer state word.  Each flag is a single
 * bit; the first one is bit 22, i.e. BUF_REFCOUNT_BITS + BUF_USAGECOUNT_BITS,
 * which is where BUF_FLAG_MASK begins.  The flags below use consecutive bits
 * 22 through 30.
 */
68 #define BM_LOCKED (1U << 22) /* buffer header is locked */
69 #define BM_DIRTY (1U << 23) /* data needs writing */
70 #define BM_VALID (1U << 24) /* data is valid */
71 #define BM_TAG_VALID (1U << 25) /* tag is assigned */
72 #define BM_IO_IN_PROGRESS (1U << 26) /* read or write in progress */
73 #define BM_IO_ERROR (1U << 27) /* previous I/O failed */
74 #define BM_JUST_DIRTIED (1U << 28) /* dirtied since write started */
75 #define BM_PIN_COUNT_WAITER (1U << 29) /* have waiter for sole pin */
76 #define BM_CHECKPOINT_NEEDED (1U << 30) /* must write for checkpoint */
77 #define BM_PERMANENT (1U << 31) /* permanent buffer (not unlogged,
80 * The maximum allowed value of usage_count represents a tradeoff between
81 * accuracy and speed of the clock-sweep buffer management algorithm. A
82 * large value (comparable to NBuffers) would approximate LRU semantics.
83 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of the
84 * clock-sweep hand to find a free buffer, so in practice we don't want the
85 * value to be very large.
/*
 * Upper bound on a buffer's usage count.  The value has to be representable
 * in the BUF_USAGECOUNT_BITS-wide usage-count field of the buffer state; a
 * static assertion in this header enforces that.
 */
87#define BM_MAX_USAGE_COUNT 5
90 "BM_MAX_USAGE_COUNT doesn't fit in BUF_USAGECOUNT_BITS bits");
92 "MAX_BACKENDS_BITS needs to be <= BUF_REFCOUNT_BITS");
95 * Buffer tag identifies which disk block the buffer contains.
97 * Note: the BufferTag data must be sufficient to determine where to write the
98 * block, without reference to pg_class or pg_tablespace entries. It's
99 * possible that the backend flushing the buffer doesn't even believe the
100 * relation is visible yet (its xact may have started before the xact that
101 * created the rel). The storage manager must be able to cope anyway.
103 * Note: if there's any pad bytes in the struct, InitBufferTag will have
104 * to be fixed to zero them, since this struct is used as a hash key.
187 * The shared buffer mapping table is partitioned to reduce contention.
188 * To determine which partition lock a given tag requires, compute the tag's
189 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
190 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
212 * BufferDesc -- shared descriptor/state data for a single shared buffer.
214 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
215 * tag, state or wait_backend_pgprocno fields. In general, buffer header lock
216 * is a spinlock which is combined with flags, refcount and usagecount into
217 * single atomic variable. This layout allows us to do some operations in a
218 * single atomic operation, without actually acquiring and releasing spinlock;
219 * for instance, increase or decrease refcount. buf_id field never changes
220 * after initialization, so does not need locking. The LWLock can take care
221 * of itself. The buffer header lock is *not* used to control access to the
222 * data in the buffer!
224 * It's assumed that nobody changes the state field while buffer header lock
225 * is held. Thus buffer header lock holder can do complex updates of the
226 * state variable in a single write, simultaneously with lock release (clearing
227 * BM_LOCKED flag). On the other hand, updating of state without holding
228 * buffer header lock is restricted to CAS, which ensures that BM_LOCKED flag
229 * is not set. Atomic increment/decrement, OR/AND etc. are not allowed.
231 * An exception is that if we have the buffer pinned, its tag can't change
232 * underneath us, so we can examine the tag without locking the buffer header.
233 * Also, in places we do one-time reads of the flags without bothering to
234 * lock the buffer header; this is generally for situations where we don't
235 * expect the flag bit being tested to be changing.
237 * We can't physically remove items from a disk page if another backend has
238 * the buffer pinned. Hence, a backend may need to wait for all other pins
239 * to go away. This is signaled by storing its own pgprocno into
240 * wait_backend_pgprocno and setting flag bit BM_PIN_COUNT_WAITER. At present,
241 * there can be only one such waiter per buffer.
243 * We use this same struct for local buffer headers, but the locks are not
244 * used and not all of the flag bits are useful either. To avoid unnecessary
245 * overhead, manipulations of the state field should be done without actual
246 * atomic operations (i.e. only pg_atomic_read_u32() and
247 * pg_atomic_unlocked_write_u32()).
249 * Be careful to avoid increasing the size of the struct when adding or
250 * reordering members. Keeping it below 64 bytes (the most common CPU
251 * cache line size) is fairly important for performance.
253 * Per-buffer I/O condition variables are currently kept outside this struct in
254 * a separate array. They could be moved in here and still fit within that
255 * limit on common systems, but for now that is not done.
260 int buf_id;
/* buffer's index number (from 0) */
262 /* state of the tag, containing flags, refcount and usagecount */
272 * Concurrent access to buffer headers has proven to be more efficient if
273 * they're cache line aligned. So we force the start of the BufferDescriptors
274 * array to be on a cache line boundary and force the elements to be cache
277 * XXX: As this primarily matters in highly concurrent workloads, which
278 * probably all are 64bit these days, and the space wastage would be a bit
279 * more noticeable on 32bit systems, we don't force the stride to be cache
280 * line sized on those. If somebody does actual performance testing, we can
283 * Note that local buffer descriptors aren't forced to be aligned - as there's
284 * no concurrent access to those it's unlikely to be beneficial.
286 * We use a 64-byte cache line size here, because that's the most common
287 * size. Making it bigger would be a waste of memory. Even if running on a
288 * platform with either 32 or 128 byte line sizes, it's good to align to
289 * boundaries and avoid false sharing.
/*
 * Evaluates to 64 when SIZEOF_VOID_P is 8 and to 1 otherwise.
 * NOTE(review): this appears to size the pad[] char array listed for
 * BufferDescPadded, in which case a value of 1 would mean "no extra padding"
 * on platforms without 8-byte pointers -- confirm against the union
 * definition.
 */
291#define BUFFERDESC_PAD_TO_SIZE (SIZEOF_VOID_P == 8 ? 64 : 1)
300 * The PendingWriteback & WritebackContext structures are used to keep
301 * information about pending flush requests to be issued to the OS.
305 /* could store different types of pending flushes here */
309 /* struct forward declared in bufmgr.h */
312 /* pointer to the max number of writeback requests to coalesce */
315 /* current number of pending writeback requests */
318 /* pending requests */
362 * Functions for acquiring/releasing a shared buffer header's spinlock. Do
363 * not apply these to local buffers!
377 * Structure to sort buffers per file on checkpoints.
379 * This structure is allocated per buffer in shared memory, so it should be
380 * kept as small as possible.
393/* ResourceOwner callbacks to hold buffer I/Os and pins */
397/* Convenience wrappers over ResourceOwnerRemember/Forget */
420 * Internal buffer management routines
428/* solely to make it easier to write tests */
431 bool forget_owner,
bool release_aio);
437 uint32 *buf_state,
bool *from_ring);
473 uint32 set_flag_bits,
bool release_aio);
483#endif /* BUFMGR_INTERNALS_H */
#define pg_write_barrier()
static void pg_atomic_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
#define InvalidBlockNumber
#define BM_MAX_USAGE_COUNT
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
struct CkptSortItem CkptSortItem
static uint32 BufTableHashPartition(uint32 hashcode)
static LWLock * BufMappingPartitionLockByIndex(uint32 index)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
union BufferDescPadded BufferDescPadded
void UnpinLocalBuffer(Buffer buffer)
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
StaticAssertDecl(BUF_REFCOUNT_BITS+BUF_USAGECOUNT_BITS+BUF_FLAG_BITS==32, "parts of buffer state space need to equal 32")
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
struct BufferDesc BufferDesc
void AtEOXact_LocalBuffers(bool isCommit)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_REFCOUNT_BITS
struct WritebackContext WritebackContext
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
struct PendingWriteback PendingWriteback
static void BufTagSetRelForkDetails(BufferTag *tag, RelFileNumber relnumber, ForkNumber forknum)
void InitBufTable(int size)
PGDLLIMPORT const ResourceOwnerDesc buffer_io_resowner_desc
void StrategyInitialize(bool init)
#define BUF_USAGECOUNT_BITS
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)
void MarkLocalBufferDirty(Buffer buffer)
#define BUFFERDESC_PAD_TO_SIZE
PGDLLIMPORT WritebackContext BackendWritebackContext
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
Size BufTableShmemSize(int size)
uint32 BufTableHashCode(BufferTag *tagPtr)
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
static void ClearBufferTag(BufferTag *tag)
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)
void WritebackContextInit(WritebackContext *context, int *max_pending)
void StrategyNotifyBgWriter(int bgwprocno)
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
PGDLLIMPORT BufferDescPadded * BufferDescriptors
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
PGDLLIMPORT ConditionVariableMinimallyPadded * BufferIOCVArray
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
PGDLLIMPORT CkptSortItem * CkptBufferIds
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
void UnpinLocalBufferNoOwner(Buffer buffer)
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
static LWLock * BufMappingPartitionLock(uint32 hashcode)
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
Size StrategyShmemSize(void)
PGDLLIMPORT BufferDesc * LocalBufferDescriptors
PGDLLIMPORT const ResourceOwnerDesc buffer_pin_resowner_desc
uint32 LockBufHdr(BufferDesc *desc)
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
static BufferDesc * GetBufferDescriptor(uint32 id)
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
LWLockPadded * MainLWLockArray
#define BUFFER_MAPPING_LWLOCK_OFFSET
#define NUM_BUFFER_PARTITIONS
#define WRITEBACK_MAX_PENDING_FLUSHES
static Datum Int32GetDatum(int32 X)
#define MAX_BACKENDS_BITS
#define InvalidRelFileNumber
void ResourceOwnerForget(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
void ResourceOwnerRemember(ResourceOwner owner, Datum value, const ResourceOwnerDesc *kind)
int wait_backend_pgprocno
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]
char pad[BUFFERDESC_PAD_TO_SIZE]