1/*-------------------------------------------------------------------------
3 * pg_buffercache_pages.c
4 * display some contents of the buffer cache
6 * contrib/pg_buffercache/pg_buffercache_pages.c
7 *-------------------------------------------------------------------------
21 #define NUM_BUFFERCACHE_PAGES_MIN_ELEM 8
22 #define NUM_BUFFERCACHE_PAGES_ELEM 9
23 #define NUM_BUFFERCACHE_SUMMARY_ELEM 5
24 #define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM 4
25 #define NUM_BUFFERCACHE_EVICT_ELEM 2
26 #define NUM_BUFFERCACHE_EVICT_RELATION_ELEM 3
27 #define NUM_BUFFERCACHE_EVICT_ALL_ELEM 3
29 #define NUM_BUFFERCACHE_NUMA_ELEM 3
32 .
name =
"pg_buffercache",
37 * Record structure holding the cache data to be exposed.
52 * An int32 is sufficiently large, as MAX_BACKENDS prevents a buffer from
53 * being pinned by too many backends and each backend will only pin once
54 * because of bufmgr.c's PrivateRefCount infrastructure.
61 * Function context for data persisting over repeated calls.
70 * Record structure holding the cache data to be exposed.
80 * Function context for data persisting over repeated calls.
93 * Function returning data from the shared buffer cache - buffer number,
94 * relation node/tablespace/database/blocknum and dirty indicator.
105/* Only need to touch memory once per backend process lifetime */
126 /* Switch context when allocating stuff to be used in later calls */
129 /* Create a user function context for cross-call persistence */
133 * To smoothly support upgrades from version 1.0 of this extension,
134 * transparently handle the (non-)existence of the pinning_backends
135 * column. Unfortunately, we have to look up the actual result type for
136 * that: we can't rely on the result type determined by the function
137 * definition without potentially crashing when somebody uses the old
138 * (or even wrong) function definition.
141 elog(
ERROR,
"return type must be a row type");
145 elog(
ERROR,
"incorrect number of output arguments");
147 /* Construct a tuple descriptor for the result rows. */
172 /* Allocate NBuffers worth of BufferCachePagesRec records. */
177 /* Set max calls and remember the user function context. */
181 /* Return to original context when allocating transient memory */
185 * Scan through all the buffers, saving the relevant fields in the
186 * fctx->record structure.
188 * We don't hold the partition locks, so we don't get a consistent
189 * snapshot across all buffers, but we do grab the buffer header
190 * locks, so the information of each buffer is self-consistent.
200 /* Lock each buffer header before inspecting. */
217 /* Note if the buffer is valid, and has storage created */
229 /* Get the saved state */
242 * Set all fields except the bufferid to null if the buffer is unused
255 /* unused for v1.0 callers, but the array is always long enough */
274 /* unused for v1.0 callers, but the array is always long enough */
279 /* Build and return the tuple. */
290 * Inquire about NUMA memory mappings for shared buffers.
292 * Returns NUMA node ID for each memory page used by the buffer. Buffers may
293 * be smaller or larger than OS memory pages. For each buffer we return one
294 * entry for each memory page used by the buffer (if the buffer is smaller,
295 * it only uses a part of one memory page).
297 * We expect both sizes (for buffers and memory pages) to be a power-of-2, so
298 * one is always a multiple of the other.
300 * In order to get reliable results we also need to touch memory pages, so
301 * that the inquiry about NUMA memory node doesn't return -2 (which indicates
302 * unmapped/unallocated pages).
323 int pages_per_buffer;
329 elog(
ERROR,
"libnuma initialization failed or NUMA is not supported on this platform");
332 * The database block size and OS memory page size are unlikely to be
333 * the same. The block size is 1-32KB, the memory page size depends on
334 * platform. On x86 it's usually 4KB, on ARM it's 4KB or 64KB, but
335 * there are also features like THP etc. Moreover, we don't quite know
336 * how the pages and buffers "align" in memory - the buffers may be
337 * shifted in some way, using more memory pages than necessary.
339 * So we need to be careful about mapping buffers to memory pages. We
340 * calculate the maximum number of pages a buffer might use, so that
341 * we allocate enough space for the entries. And then we count the
342 * actual number of entries as we scan the buffers.
344 * This information is needed before calling move_pages() for the NUMA node inquiry.
350 * The page size and the block size are both expected to be 2^k, so
351 * one divides the other (we don't know in which direction). This does
352 * not say anything about the relative alignment of pages/buffers.
354 Assert((os_page_size % BLCKSZ == 0) || (BLCKSZ % os_page_size == 0));
357 * How many addresses are we going to query? Simply get the page for
358 * the first buffer, and the first page after the last buffer, and
359 * count the pages between them.
363 endptr = (
char *)
TYPEALIGN(os_page_size,
365 os_page_count = (endptr - startptr) / os_page_size;
367 /* Used to determine the NUMA node for all OS pages at once */
368 os_page_ptrs =
palloc0(
sizeof(
void *) * os_page_count);
369 os_page_status =
palloc(
sizeof(
uint64) * os_page_count);
371 /* Fill pointers for all the memory pages. */
373 for (
char *ptr = startptr; ptr < endptr; ptr += os_page_size)
375 os_page_ptrs[
idx++] = ptr;
377 /* Only need to touch memory once per backend process lifetime */
385 "os_page_size=%zu",
NBuffers, os_page_count, os_page_size);
388 * If we ever get 0xff back from the kernel inquiry, then we probably
389 * have a bug in our buffer-to-OS-page mapping code here.
391 memset(os_page_status, 0xff,
sizeof(
int) * os_page_count);
393 /* Query NUMA status for all the pointers */
395 elog(
ERROR,
"failed NUMA pages inquiry: %m");
397 /* Initialize the multi-call context, load entries about buffers */
401 /* Switch context when allocating stuff to be used in later calls */
404 /* Create a user function context for cross-call persistence */
408 elog(
ERROR,
"return type must be a row type");
411 elog(
ERROR,
"incorrect number of output arguments");
413 /* Construct a tuple descriptor for the result rows. */
425 * Each buffer needs at least one entry, but it might be offset in
426 * some way, and use one extra entry. So we allocate space for the
427 * maximum number of entries we might need, and then count the exact
428 * number as we're walking buffers. That way we can do it in one pass,
429 * without reallocating memory.
431 pages_per_buffer =
Max(1, BLCKSZ / os_page_size) + 1;
432 max_entries =
NBuffers * pages_per_buffer;
434 /* Allocate entries for BufferCachePagesRec records. */
439 /* Return to original context when allocating transient memory */
443 elog(
DEBUG1,
"NUMA: page-faulting the buffercache for proper NUMA readouts");
446 * Scan through all the buffers, saving the relevant fields in the
447 * fctx->record structure.
449 * We don't hold the partition locks, so we don't get a consistent
450 * snapshot across all buffers, but we do grab the buffer header
451 * locks, so the information of each buffer is self-consistent.
453 * This loop touches and stores addresses into os_page_ptrs[] as input
454 * to one big move_pages(2) inquiry system call. Basically we ask for
455 * all memory pages for NBuffers.
473 /* Lock each buffer header before inspecting. */
478 /* start of the first page of this buffer */
481 /* end of the buffer (no need to align to memory page) */
482 endptr_buff = buffptr + BLCKSZ;
484 Assert(startptr_buff < endptr_buff);
486 /* calculate ID of the first page for this buffer */
487 page_num = (startptr_buff - startptr) / os_page_size;
489 /* Add an entry for each OS page overlapping with this buffer. */
490 for (
char *ptr = startptr_buff; ptr < endptr_buff; ptr += os_page_size)
496 /* advance to the next entry/page */
504 /* Set max calls and remember the user function context. */
508 /* Remember this backend touched the pages */
514 /* Get the saved state */
532 /* Build and return the tuple. */
551 int32 buffers_used = 0;
552 int32 buffers_unused = 0;
553 int32 buffers_dirty = 0;
554 int32 buffers_pinned = 0;
555 int64 usagecount_total = 0;
558 elog(
ERROR,
"return type must be a row type");
568 * This function summarizes the state of all headers. Locking the
569 * buffer headers wouldn't provide an improved result as the state of
570 * the buffer can still change after we release the lock and it'd
571 * noticeably increase the cost of the function.
591 memset(nulls, 0,
sizeof(nulls));
597 if (buffers_used != 0)
602 /* Build and return the tuple. */
630 usage_counts[usage_count]++;
633 dirty[usage_count]++;
636 pinned[usage_count]++;
653 * Helper function to check if the user has superuser privileges.
660 (
errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
661 errmsg(
"must be superuser to use %s()",
666 * Try to evict a shared buffer.
681 elog(
ERROR,
"return type must be a row type");
698 * Try to evict the specified relation's shared buffers.
712 int32 buffers_evicted = 0;
713 int32 buffers_flushed = 0;
714 int32 buffers_skipped = 0;
717 elog(
ERROR,
"return type must be a row type");
727 (
errcode(ERRCODE_INVALID_PARAMETER_VALUE),
728 errmsg(
"relation uses local buffers, %s() is intended to be used for shared buffers only",
729 "pg_buffercache_evict_relation")));
748 * Try to evict all shared buffers.
759 int32 buffers_evicted = 0;
760 int32 buffers_flushed = 0;
761 int32 buffers_skipped = 0;
764 elog(
ERROR,
"return type must be a row type");
Datum idx(PG_FUNCTION_ARGS)
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
#define InvalidBlockNumber
static Datum values[MAXATTR]
#define BM_MAX_USAGE_COUNT
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BUF_STATE_GET_USAGECOUNT(state)
#define BUF_STATE_GET_REFCOUNT(state)
static BufferDesc * GetBufferDescriptor(uint32 id)
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
void EvictAllUnpinnedBuffers(int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
bool EvictUnpinnedBuffer(Buffer buf, bool *buffer_flushed)
uint32 LockBufHdr(BufferDesc *desc)
static Block BufferGetBlock(Buffer buffer)
#define TYPEALIGN(ALIGNVAL, LEN)
#define TYPEALIGN_DOWN(ALIGNVAL, LEN)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
TupleDesc BlessTupleDesc(TupleDesc tupdesc)
#define PG_GETARG_INT32(n)
#define PG_RETURN_DATUM(x)
void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags)
TypeFuncClass get_call_result_type(FunctionCallInfo fcinfo, Oid *resultTypeId, TupleDesc *resultTupleDesc)
#define SRF_IS_FIRSTCALL()
#define SRF_PERCALL_SETUP()
#define SRF_RETURN_NEXT(_funcctx, _result)
#define SRF_FIRSTCALL_INIT()
static Datum HeapTupleGetDatum(const HeapTupleData *tuple)
#define SRF_RETURN_DONE(_funcctx)
Assert(PointerIsAligned(start, uint64))
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
void * palloc0(Size size)
MemoryContext CurrentMemoryContext
void * MemoryContextAllocHuge(MemoryContext context, Size size)
#define CHECK_FOR_INTERRUPTS()
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
PG_MODULE_MAGIC_EXT(.name="pg_buffercache",.version=PG_VERSION)
Datum pg_buffercache_evict_relation(PG_FUNCTION_ARGS)
#define NUM_BUFFERCACHE_USAGE_COUNTS_ELEM
Datum pg_buffercache_evict(PG_FUNCTION_ARGS)
Datum pg_buffercache_summary(PG_FUNCTION_ARGS)
static void pg_buffercache_superuser_check(char *func_name)
PG_FUNCTION_INFO_V1(pg_buffercache_pages)
Datum pg_buffercache_usage_counts(PG_FUNCTION_ARGS)
#define NUM_BUFFERCACHE_SUMMARY_ELEM
Datum pg_buffercache_pages(PG_FUNCTION_ARGS)
#define NUM_BUFFERCACHE_EVICT_ELEM
#define NUM_BUFFERCACHE_PAGES_MIN_ELEM
#define NUM_BUFFERCACHE_EVICT_ALL_ELEM
Datum pg_buffercache_evict_all(PG_FUNCTION_ARGS)
#define NUM_BUFFERCACHE_PAGES_ELEM
Datum pg_buffercache_numa_pages(PG_FUNCTION_ARGS)
static bool firstNumaTouch
#define NUM_BUFFERCACHE_NUMA_ELEM
#define NUM_BUFFERCACHE_EVICT_RELATION_ELEM
#define pg_numa_touch_mem_if_required(ptr)
PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
PGDLLIMPORT int pg_numa_init(void)
static Datum Int64GetDatum(int64 X)
static Datum Int16GetDatum(int16 X)
static Datum BoolGetDatum(bool X)
static Datum ObjectIdGetDatum(Oid X)
static Datum Float8GetDatum(float8 X)
static Datum Int32GetDatum(int32 X)
#define RelationUsesLocalBuffers(relation)
Size pg_get_shmem_pagesize(void)
void relation_close(Relation relation, LOCKMODE lockmode)
Relation relation_open(Oid relationId, LOCKMODE lockmode)
BufferCacheNumaRec * record
BufferCachePagesRec * record
RelFileNumber relfilenumber
MemoryContext multi_call_memory_ctx
TupleDesc CreateTemplateTupleDesc(int natts)
void TupleDescInitEntry(TupleDesc desc, AttrNumber attributeNumber, const char *attributeName, Oid oidtypeid, int32 typmod, int attdim)
void tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc, const Datum *values, const bool *isnull)