1/*-------------------------------------------------------------------------
4 * manage dynamic shared memory segments
6 * This file provides low-level APIs for creating and destroying shared
7 * memory segments using several different possible techniques. We refer
8 * to these segments as dynamic because they can be created, altered, and
9 * destroyed at any point during the server life cycle. This is unlike
10 * the main shared memory segment, of which there is always exactly one
11 * and which is always mapped at a fixed address in every PostgreSQL
14 * Because not all systems provide the same primitives in this area, nor
15 * do all primitives behave the same way on all systems, we provide
16 * several implementations of this facility. Many systems implement
17 * POSIX shared memory (shm_open etc.), which is well-suited to our needs
18 * in this area, with the exception that shared memory identifiers live
19 * in a flat system-wide namespace, raising the uncomfortable prospect of
20 * name collisions with other processes (including other copies of
21 * PostgreSQL) running on the same system. Some systems only support
22 * the older System V shared memory interface (shmget etc.) which is
23 * also usable; however, the default allocation limits are often quite
24 * small, and the namespace is even more restricted.
26 * We also provide an mmap-based shared memory implementation. This may
27 * be useful on systems that provide shared memory via a special-purpose
28 * filesystem; by opting for this implementation, the user can even
29 * control precisely where their shared memory segments are placed. It
30 * can also be used as a fallback for systems where shm_open and shmget
31 * are not available or can't be used for some reason. Of course,
32 * mapping a file residing on an actual spinning disk is a fairly poor
33 * approximation for shared memory because writeback may hurt performance
34 * substantially, but there should be few systems where we must make do
35 * with such poor tools.
37 * As ever, Windows requires its own implementation.
39 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
40 * Portions Copyright (c) 1994, Regents of the University of California
44 * src/backend/storage/ipc/dsm_impl.c
46 *-------------------------------------------------------------------------
74 void **impl_private,
void **mapped_address,
75 Size *mapped_size,
int elevel);
80 void **impl_private,
void **mapped_address,
81 Size *mapped_size,
int elevel);
85 void **impl_private,
void **mapped_address,
86 Size *mapped_size,
int elevel);
90 void **impl_private,
void **mapped_address,
91 Size *mapped_size,
int elevel);
102#ifdef USE_DSM_WINDOWS
111/* Implementation selector. */
114/* Amount of space reserved for DSM segments in the main area. */
117/* Size of buffer to be used for zero-filling. */
118 #define ZBUFFER_SIZE 8192
120 #define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
123 * Perform a low-level shared memory operation in a platform-specific way,
124 * as dictated by the selected implementation. Each implementation is
125 * required to implement the following primitives.
127 * DSM_OP_CREATE. Create a segment whose size is the request_size and
130 * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
132 * DSM_OP_DETACH. Unmap the segment.
134 * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
138 * op: The operation to be performed.
139 * handle: The handle of an existing object, or for DSM_OP_CREATE, the
140 * identifier for the new handle the caller wants created.
141 * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
142 * impl_private: Private, implementation-specific data. Will be a pointer
143 * to NULL for the first operation on a shared memory segment within this
144 * backend; thereafter, it will point to the value to which it was set
145 * on the previous call.
146 * mapped_address: Pointer to start of current mapping; pointer to NULL
147 * if none. Updated with new mapping address.
148 * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
149 * Updated with new mapped size.
150 * elevel: Level at which to log errors.
152 * Return value: true on success, false on failure. When false is returned,
153 * a message should first be logged at the specified elevel, except in the
154 * case where DSM_OP_CREATE experiences a name collision, which should
155 * silently return false.
160 void **impl_private,
void **mapped_address,
Size *mapped_size,
165 (*mapped_address == NULL && *mapped_size == 0));
172 mapped_address, mapped_size, elevel);
177 mapped_address, mapped_size, elevel);
179#ifdef USE_DSM_WINDOWS
181 return dsm_impl_windows(op, handle, request_size, impl_private,
182 mapped_address, mapped_size, elevel);
187 mapped_address, mapped_size, elevel);
190 elog(
ERROR,
"unexpected dynamic shared memory type: %d",
198 * Operating system primitives to support POSIX shared memory.
200 * POSIX shared memory segments are created and attached using shm_open()
201 * and shm_unlink(); other operations, such as sizing or mapping the
202 * segment, are performed as if the shared memory segments were files.
204 * Indeed, on some platforms, they may be implemented that way. While
205 * POSIX shared memory segments seem intended to exist in a flat namespace,
206 * some operating systems may implement them as files, even going so far
207 * as to treat a request for /xyz as a request to create a file by that name
208 * in the root directory. Users of such broken platforms should select
209 * a different shared memory implementation.
213 void **impl_private,
void **mapped_address,
Size *mapped_size,
223 /* Handle teardown cases. */
226 if (*mapped_address != NULL
227 && munmap(*mapped_address, *mapped_size) != 0)
231 errmsg(
"could not unmap shared memory segment \"%s\": %m",
235 *mapped_address = NULL;
241 errmsg(
"could not remove shared memory segment \"%s\": %m",
249 * Create new segment or open an existing one for attach.
251 * Even though we will close the FD before returning, it seems desirable
252 * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
253 * failure. The fact that we won't hold the FD open long justifies using
254 * ReserveExternalFD rather than AcquireExternalFD, though.
258 flags = O_RDWR | (op ==
DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
265 errmsg(
"could not open shared memory segment \"%s\": %m",
271 * If we're attaching the segment, determine the current size; if we are
272 * creating the segment, set the size to the requested value.
282 /* Back out what's already been done. */
290 errmsg(
"could not stat shared memory segment \"%s\": %m",
300 /* Back out what's already been done. */
309 errmsg(
"could not resize shared memory segment \"%s\" to %zu bytes: %m",
310 name, request_size)));
315 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
321 /* Back out what's already been done. */
331 errmsg(
"could not map shared memory segment \"%s\": %m",
335 *mapped_address = address;
336 *mapped_size = request_size;
344 * Set the size of a virtual memory region associated with a file descriptor.
345 * If necessary, also ensure that virtual memory is actually allocated by the
346 * operating system, to avoid nasty surprises later.
348 * Returns non-zero if either truncation or allocation fails, and sets errno.
355 sigset_t save_sigmask;
358 * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
359 * for quite a long time, and is an all-or-nothing operation. If we
360 * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
361 * recovery conflicts), the retry loop might never succeed.
364 sigprocmask(SIG_SETMASK, &
BlockSig, &save_sigmask);
367#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
370 * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use
371 * ftruncate, the file would contain a hole. Accessing memory backed by a
372 * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
373 * is no more tmpfs space available. So we ask tmpfs to allocate pages
374 * here, so we can fail gracefully with ENOSPC now rather than risking
377 * We still use a traditional EINTR retry loop to handle SIGCONT.
378 * posix_fallocate() doesn't restart automatically, and we don't want this
379 * to fail if you attach a debugger.
383 rc = posix_fallocate(
fd, 0, size);
384 }
while (rc ==
EINTR);
387 * The caller expects errno to be set, but posix_fallocate() doesn't set
388 * it. Instead it returns error numbers directly. So set errno, even
389 * though we'll also return rc to indicate success or failure.
393 /* Extend the file to the requested size. */
396 rc = ftruncate(
fd, size);
397 }
while (rc < 0 && errno ==
EINTR);
404 sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
411#endif /* USE_DSM_POSIX */
415 * Operating system primitives to support System V shared memory.
417 * System V shared memory segments are manipulated using shmget(), shmat(),
418 * shmdt(), and shmctl(). As the default allocation limits for System V
419 * shared memory are usually quite low, the POSIX facilities may be
420 * preferable; but those are not supported everywhere.
424 void **impl_private,
void **mapped_address,
Size *mapped_size,
434 * POSIX shared memory and mmap-based shared memory identify segments with
435 * names. To avoid needless error message variation, we use the handle as
441 * The System V shared memory namespace is very restricted; names are of
442 * type key_t, which is expected to be some sort of integer data type, but
443 * not necessarily the same one as dsm_handle. Since we use dsm_handle to
444 * identify shared memory segments across processes, this might seem like
445 * a problem, but it's really not. If dsm_handle is bigger than key_t,
446 * the cast below might truncate away some bits from the handle the
447 * user provided, but it'll truncate exactly the same bits away in exactly
448 * the same fashion every time we use that handle, which is all that
449 * really matters. Conversely, if dsm_handle is smaller than key_t, we
450 * won't use the full range of available key space, but that's no big deal
453 * We do make sure that the key isn't negative, because that might not be
457 if (
key < 1)
/* avoid compiler warning if type is unsigned */
461 * There's one special key, IPC_PRIVATE, which can't be used. If we end
462 * up with that value by chance during a create operation, just pretend it
463 * already exists, so that caller will retry. If we run into it anywhere
464 * else, the caller has passed a handle that doesn't correspond to
465 * anything we ever created, which should not happen.
470 elog(
DEBUG4,
"System V shared memory key may not be IPC_PRIVATE");
476 * Before we can do anything with a shared memory segment, we have to map
477 * the shared memory key to a shared memory identifier using shmget(). To
478 * avoid repeated lookups, we cache the identifier using impl_private.
480 if (*impl_private != NULL)
482 ident_cache = *impl_private;
483 ident = *ident_cache;
491 * Allocate the memory BEFORE acquiring the resource, so that we don't
492 * leak the resource if memory allocation fails.
497 * When using shmget to find an existing segment, we must pass the
498 * size as 0. Passing a non-zero size which is greater than the
499 * actual size will result in EINVAL.
506 segsize = request_size;
509 if ((
ident = shmget(
key, segsize, flags)) == -1)
513 int save_errno = errno;
519 errmsg(
"could not get shared memory segment: %m")));
524 *ident_cache =
ident;
525 *impl_private = ident_cache;
528 /* Handle teardown cases. */
532 *impl_private = NULL;
533 if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
537 errmsg(
"could not unmap shared memory segment \"%s\": %m",
541 *mapped_address = NULL;
547 errmsg(
"could not remove shared memory segment \"%s\": %m",
554 /* If we're attaching it, we must use IPC_STAT to determine the size. */
563 errmsg(
"could not stat shared memory segment \"%s\": %m",
567 request_size = shm.shm_segsz;
572 if (address == (
void *) -1)
576 /* Back out what's already been done. */
584 errmsg(
"could not map shared memory segment \"%s\": %m",
588 *mapped_address = address;
589 *mapped_size = request_size;
595#ifdef USE_DSM_WINDOWS
597 * Operating system primitives to support Windows shared memory.
599 * The Windows shared memory implementation uses file mapping objects,
600 * which can be backed by either a physical file or the system paging file.
601 * The current implementation uses the system paging file, because the
602 * effects (such as on performance) of using a physical file are unclear, and
603 * because main shared memory on Windows is handled in a similar way.
605 * A memory mapping object is a kernel object - they always get deleted when
606 * the last reference to them goes away, either explicitly via a CloseHandle or
607 * when the process containing the reference exits.
611 void **impl_private,
void **mapped_address,
612 Size *mapped_size,
int elevel)
617 MEMORY_BASIC_INFORMATION info;
620 * Storing the shared memory segment in the Global\ namespace would allow
621 * any process running in any session to access that file mapping object,
622 * provided that the caller has the required access rights. But to avoid
623 * the issues faced with main shared memory, we use a naming convention
624 * similar to the one used for main shared memory. We can change this once
625 * the issue mentioned in GetSharedMemName is resolved.
630 * Handle teardown cases. Since Windows automatically destroys the object
631 * when no references remain, we can treat it the same as detach.
635 if (*mapped_address != NULL
636 && UnmapViewOfFile(*mapped_address) == 0)
641 errmsg(
"could not unmap shared memory segment \"%s\": %m",
645 if (*impl_private != NULL
646 && CloseHandle(*impl_private) == 0)
651 errmsg(
"could not remove shared memory segment \"%s\": %m",
656 *impl_private = NULL;
657 *mapped_address = NULL;
662 /* Create new segment or open an existing one for attach. */
669 /* Shifts >= the width of the type are undefined. */
671 size_high = request_size >> 32;
675 size_low = (DWORD) request_size;
677 /* CreateFileMapping might not clear the error code on success */
680 hmap = CreateFileMapping(INVALID_HANDLE_VALUE,
/* Use the pagefile */
681 NULL,
/* Default security attrs */
682 PAGE_READWRITE,
/* Memory is read/write */
683 size_high,
/* Upper 32 bits of size */
684 size_low,
/* Lower 32 bits of size */
688 if (
errcode == ERROR_ALREADY_EXISTS ||
errcode == ERROR_ACCESS_DENIED)
691 * On Windows, when the segment already exists, a handle for the
692 * existing segment is returned. We must close it before
693 * returning. However, if the existing segment is created by a
694 * service, then it returns ERROR_ACCESS_DENIED. We don't do
695 * _dosmaperr here, so errno won't be modified.
707 errmsg(
"could not create shared memory segment \"%s\": %m",
714 hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
715 FALSE,
/* do not inherit the handle */
716 name);
/* name of mapping object */
722 errmsg(
"could not open shared memory segment \"%s\": %m",
729 address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
736 /* Back out what's already been done. */
743 errmsg(
"could not map shared memory segment \"%s\": %m",
749 * VirtualQuery gives size in page_size units, which is 4K for Windows. We
750 * need size only when we are attaching, but it's better to get the size
751 * when creating new segment to keep size consistent both for
752 * DSM_OP_CREATE and DSM_OP_ATTACH.
754 if (VirtualQuery(address, &info,
sizeof(info)) == 0)
759 /* Back out what's already been done. */
761 UnmapViewOfFile(address);
767 errmsg(
"could not stat shared memory segment \"%s\": %m",
772 *mapped_address = address;
773 *mapped_size = info.RegionSize;
774 *impl_private = hmap;
782 * Operating system primitives to support mmap-based shared memory.
784 * Calling this "shared memory" is somewhat of a misnomer, because what
785 * we're really doing is creating a bunch of files and mapping them into
786 * our address space. The operating system may feel obliged to
787 * synchronize the contents to disk even if nothing is being paged out,
788 * which will not serve us well. The user can relocate the pg_dynshmem
789 * directory to a ramdisk to avoid this problem, if available.
793 void **impl_private,
void **mapped_address,
Size *mapped_size,
804 /* Handle teardown cases. */
807 if (*mapped_address != NULL
808 && munmap(*mapped_address, *mapped_size) != 0)
812 errmsg(
"could not unmap shared memory segment \"%s\": %m",
816 *mapped_address = NULL;
822 errmsg(
"could not remove shared memory segment \"%s\": %m",
829 /* Create new segment or open an existing one for attach. */
830 flags = O_RDWR | (op ==
DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
836 errmsg(
"could not open shared memory segment \"%s\": %m",
842 * If we're attaching the segment, determine the current size; if we are
843 * creating the segment, set the size to the requested value.
853 /* Back out what's already been done. */
860 errmsg(
"could not stat shared memory segment \"%s\": %m",
869 * Allocate a buffer full of zeros.
871 * Note: palloc zbuffer, instead of just using a local char array, to
872 * ensure it is reasonably well-aligned; this may save a few cycles
873 * transferring data to the kernel.
880 * Zero-fill the file. We have to do this the hard way to ensure that
881 * all the file space has really been allocated, so that we don't
882 * later seg fault when accessing the memory mapping. This is pretty
892 if (
write(
fd, zbuffer, goal) == goal)
903 /* Back out what's already been done. */
907 errno = save_errno ? save_errno : ENOSPC;
911 errmsg(
"could not resize shared memory segment \"%s\" to %zu bytes: %m",
912 name, request_size)));
918 address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
924 /* Back out what's already been done. */
933 errmsg(
"could not map shared memory segment \"%s\": %m",
937 *mapped_address = address;
938 *mapped_size = request_size;
944 errmsg(
"could not close shared memory segment \"%s\": %m",
954 * Implementation-specific actions that must be performed when a segment is to
955 * be preserved even when no backend has it attached.
957 * Except on Windows, we don't need to do anything at all. But since Windows
958 * cleans up segments automatically when no references remain, we duplicate
959 * the segment handle into the postmaster process. The postmaster needn't
960 * do anything to receive the handle; Windows transfers it automatically.
964 void **impl_private_pm_handle)
968#ifdef USE_DSM_WINDOWS
974 if (!DuplicateHandle(GetCurrentProcess(), impl_private,
975 PostmasterHandle, &hmap, 0, FALSE,
976 DUPLICATE_SAME_ACCESS))
984 errmsg(
"could not duplicate handle for \"%s\": %m",
989 * Here, we remember the handle that we created in the
990 * postmaster process. This handle isn't actually usable in
991 * any process other than the postmaster, but that doesn't
992 * matter. We're just holding onto it so that, if the segment
993 * is unpinned, dsm_impl_unpin_segment can close it.
995 *impl_private_pm_handle = hmap;
1005 * Implementation-specific actions that must be performed when a segment is no
1006 * longer to be preserved, so that it will be cleaned up when all backends
1007 * have detached from it.
1009 * Except on Windows, we don't need to do anything at all. For Windows, we
1010 * close the extra handle that dsm_impl_pin_segment created in the
1011 * postmaster's process space.
1018#ifdef USE_DSM_WINDOWS
1022 if (*impl_private &&
1023 !DuplicateHandle(PostmasterHandle, *impl_private,
1024 NULL, NULL, 0, FALSE,
1025 DUPLICATE_CLOSE_SOURCE))
1033 errmsg(
"could not duplicate handle for \"%s\": %m",
1037 *impl_private = NULL;
1049 if (errno == EFBIG || errno == ENOMEM)
1050 return errcode(ERRCODE_OUT_OF_MEMORY);
void dsm_impl_pin_segment(dsm_handle handle, void *impl_private, void **impl_private_pm_handle)
int min_dynamic_shared_memory
static int errcode_for_dynamic_shared_memory(void)
#define SEGMENT_NAME_PREFIX
void dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
static int dsm_impl_posix_resize(int fd, off_t size)
bool dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
int dynamic_shared_memory_type
static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
const struct config_enum_entry dynamic_shared_memory_options[]
static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, void **impl_private, void **mapped_address, Size *mapped_size, int elevel)
#define DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE
#define PG_DYNSHMEM_MMAP_FILE_PREFIX
int errcode_for_file_access(void)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
int CloseTransientFile(int fd)
void ReleaseExternalFD(void)
void ReserveExternalFD(void)
int OpenTransientFile(const char *fileName, int fileFlags)
#define PG_FILE_MODE_OWNER
Assert(PointerIsAligned(start, uint64))
void * MemoryContextAlloc(MemoryContext context, Size size)
void pfree(void *pointer)
void * palloc0(Size size)
MemoryContext TopMemoryContext
static int fd(const char *x, int i)
static void pgstat_report_wait_start(uint32 wait_event_info)
static void pgstat_report_wait_end(void)
void _dosmaperr(unsigned long)