1/*-------------------------------------------------------------------------
4 * Implement shared memory using SysV facilities
6 * These routines used to be a fairly thin layer on top of SysV shared
7 * memory functionality. With the addition of anonymous-shmem logic,
8 * they're a bit fatter now. We still require a SysV shmem block to
9 * exist, though, because mmap'd shmem provides no way to find out how
10 * many processes are attached, which we need for interlocking purposes.
12 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
13 * Portions Copyright (c) 1994, Regents of the University of California
16 * src/backend/port/sysv_shmem.c
18 *-------------------------------------------------------------------------
43 * As of PostgreSQL 9.3, we normally allocate only a very small amount of
44 * System V shared memory, and only for the purposes of providing an
45 * interlock to protect the data directory. The real shared memory block
46 * is allocated using mmap(). This works around the problem that many
47 * systems have very low limits on the amount of System V shared memory
48 * that can be allocated. Even a limit of a few megabytes will be enough
49 * to run many copies of PostgreSQL without needing to adjust system settings.
51 * We assume that no one will attempt to run PostgreSQL 9.3 or later on
52 * systems that are ancient enough that anonymous shared memory is not
53 * supported, such as pre-2.4 versions of Linux. If that turns out to be
54 * false, we might need to add compile and/or run-time tests here and do this
55 * only if the running kernel supports it.
57 * However, we must always disable this logic in the EXEC_BACKEND case, and
58 * fall back to the old method of allocating the entire segment using System V
59 * shared memory, because there's no way to attach an anonymous mmap'd segment
60 * to a process after exec(). Since EXEC_BACKEND is intended only for
61 * developer use, this shouldn't be a big problem. Because of this, we do
62 * not worry about supporting anonymous shmem in the EXEC_BACKEND cases below.
64 * As of PostgreSQL 12, we regained the ability to use a large System V shared
65 * memory region even in non-EXEC_BACKEND builds, if shared_memory_type is set
66 * to sysv (though this is not the default).
71 typedef int IpcMemoryId;
/* shared memory ID returned by shmget(2) */
74 * How does a given IpcMemoryId relate to this PostgreSQL process?
76 * One could recycle unattached segments of different data directories if we
77 * distinguished that case from other SHMSTATE_FOREIGN cases. Doing so would
78 * cause us to visit less of the key space, making us less likely to detect a
79 * SHMSTATE_ATTACHED key. It would also complicate the concurrency analysis,
80 * in that postmasters of different data directories could simultaneously
81 * attempt to recycle a given key. We'll waste keys longer in some cases, but
82 * avoiding the problems of the alternative justifies that loss.
109 * InternalIpcMemoryCreate(memKey, size)
111 * Attempt to create a new shared memory segment with the specified key.
112 * Will fail (return NULL) if such a segment already exists. If successful,
113 * attach the segment to the current process and return its attached address.
114 * On success, callbacks are registered with on_shmem_exit to detach and
115 * delete the segment when on_shmem_exit is called.
117 * If we fail with a failure code other than collision-with-existing-segment,
118 * print out an error and abort. Other types of errors are not recoverable.
124 void *requestedAddress = NULL;
128 * Normally we just pass requestedAddress = NULL to shmat(), allowing the
129 * system to choose where the segment gets mapped. But in an EXEC_BACKEND
130 * build, it's possible for whatever is chosen in the postmaster to not
131 * work for backends, due to variations in address space layout. As a
132 * rather klugy workaround, allow the user to specify the address to use
133 * via setting the environment variable PG_SHMEM_ADDR. (If this were of
134 * interest for anything except debugging, we'd probably create a cleaner
135 * and better-documented way to set it, such as a GUC.)
139 char *pg_shmem_addr = getenv(
"PG_SHMEM_ADDR");
142 requestedAddress = (
void *) strtoul(pg_shmem_addr, NULL, 0);
145#if defined(__darwin__) && SIZEOF_VOID_P == 8
147 * Provide a default value that is believed to avoid problems with
148 * ASLR on the current macOS release.
150 requestedAddress = (
void *) 0x80000000000;
160 int shmget_errno = errno;
163 * Fail quietly if error indicates a collision with existing segment.
164 * One would expect EEXIST, given that we said IPC_EXCL, but perhaps
165 * we could get a permission violation instead? Also, EIDRM might
166 * occur if an old seg is slated for destruction but not gone yet.
168 if (shmget_errno == EEXIST || shmget_errno == EACCES
170 || shmget_errno ==
EIDRM
176 * Some BSD-derived kernels are known to return EINVAL, not EEXIST, if
177 * there is an existing segment but it's smaller than "size" (this is
178 * a result of poorly-thought-out ordering of error tests). To
179 * distinguish between collision and invalid size in such cases, we
180 * make a second try with size = 0. These kernels do not test size
181 * against SHMMIN in the preexisting-segment case, so we will not get
182 * EINVAL a second time if there is such a segment.
184 if (shmget_errno == EINVAL)
190 /* As above, fail quietly if we verify a collision */
191 if (errno == EEXIST || errno == EACCES
197 /* Otherwise, fall through to report the original error */
202 * On most platforms we cannot get here because SHMMIN is
203 * greater than zero. However, if we do succeed in creating a
204 * zero-size segment, free it and then fall through to report
205 * the original error.
207 if (shmctl(shmid,
IPC_RMID, NULL) < 0)
208 elog(
LOG,
"shmctl(%d, %d, 0) failed: %m",
214 * Else complain and abort.
216 * Note: at this point EINVAL should mean that either SHMMIN or SHMMAX
217 * is violated. SHMALL violation might be reported as either ENOMEM
218 * (BSDen) or ENOSPC (Linux); the Single Unix Spec fails to say which
219 * it should be. SHMMNI violation is ENOSPC, per spec. Just plain
220 * not-enough-RAM is ENOMEM.
222 errno = shmget_errno;
224 (
errmsg(
"could not create shared memory segment: %m"),
225 errdetail(
"Failed system call was shmget(key=%lu, size=%zu, 0%o).",
226 (
unsigned long) memKey, size,
228 (shmget_errno == EINVAL) ?
229 errhint(
"This error usually means that PostgreSQL's request for a shared memory "
230 "segment exceeded your kernel's SHMMAX parameter, or possibly that "
232 "your kernel's SHMMIN parameter.\n"
233 "The PostgreSQL documentation contains more information about shared "
234 "memory configuration.") : 0,
235 (shmget_errno == ENOMEM) ?
236 errhint(
"This error usually means that PostgreSQL's request for a shared "
237 "memory segment exceeded your kernel's SHMALL parameter. You might need "
238 "to reconfigure the kernel with larger SHMALL.\n"
239 "The PostgreSQL documentation contains more information about shared "
240 "memory configuration.") : 0,
241 (shmget_errno == ENOSPC) ?
242 errhint(
"This error does *not* mean that you have run out of disk space. "
243 "It occurs either if all available shared memory IDs have been taken, "
244 "in which case you need to raise the SHMMNI parameter in your kernel, "
245 "or because the system's overall limit for shared memory has been "
247 "The PostgreSQL documentation contains more information about shared "
248 "memory configuration.") : 0));
251 /* Register on-exit routine to delete the new segment */
254 /* OK, should be able to attach to the segment */
257 if (memAddress == (
void *) -1)
258 elog(
FATAL,
"shmat(id=%d, addr=%p, flags=0x%x) failed: %m",
261 /* Register on-exit routine to detach new segment before deleting */
265 * Store shmem key and ID in data directory lockfile. Format to try to
266 * keep it the same length always (trailing junk in the lockfile won't
267 * hurt, but might confuse humans).
273 (
unsigned long) memKey, (
unsigned long) shmid);
280/****************************************************************************/
281/* IpcMemoryDetach(status, shmaddr) removes a shared memory segment */
282/* from the process's address space */
283/* (called as an on_shmem_exit callback, hence funny argument list) */
284/****************************************************************************/
288 /* Detach System V shared memory block. */
293/****************************************************************************/
294/* IpcMemoryDelete(status, shmId) deletes a shared memory segment */
295/* (called as an on_shmem_exit callback, hence funny argument list) */
296/****************************************************************************/
301 elog(
LOG,
"shmctl(%d, %d, 0) failed: %m",
306 * PGSharedMemoryIsInUse
308 * Is a previously-existing shmem segment still existing and in use?
310 * The point of this exercise is to detect the case where a prior postmaster
311 * crashed, but it left child backends that are still running. Therefore
312 * we only care about shmem segments that are associated with the intended
313 * DataDir. This is an important consideration since accidental matches of
314 * shmem segment IDs are reasonably common.
323 if (memAddress && shmdt(memAddress) < 0)
324 elog(
LOG,
"shmdt(%p) failed: %m", memAddress);
339 * Test for a segment with id shmId; see comment at IpcMemoryState.
341 * If the segment exists, we'll attempt to attach to it, using attachAt
342 * if that's not NULL (but it's best to pass NULL if possible).
344 * *addr is set to the segment memory address if we attached to it, else NULL.
351 struct shmid_ds shmStat;
358 * First, try to stat the shm segment ID, to see if it exists at all.
360 if (shmctl(shmId,
IPC_STAT, &shmStat) < 0)
363 * EINVAL actually has multiple possible causes documented in the
364 * shmctl man page, but we assume it must mean the segment no longer exists.
371 * EACCES implies we have no read permission, which means it is not a
372 * Postgres shmem segment (or at least, not one that is relevant to
373 * our data directory).
379 * Some Linux kernel versions (in fact, all of them as of July 2007)
380 * sometimes return EIDRM when EINVAL is correct. The Linux kernel
381 * actually does not have any internal state that would justify
382 * returning EIDRM, so we can get away with assuming that EIDRM is
383 * equivalent to EINVAL on that platform.
385#ifdef HAVE_LINUX_EIDRM_BUG
391 * Otherwise, we had better assume that the segment is in use. The
392 * only likely case is (non-Linux, assumed spec-compliant) EIDRM,
393 * which implies that the segment has been IPC_RMID'd but there are
394 * still processes attached to it.
400 * Try to attach to the segment and see if it matches our data directory.
401 * This avoids any risk of duplicate-shmem-key conflicts on machines that
402 * are running several postmasters under the same userid.
404 * (When we're called from PGSharedMemoryCreate, this stat call is
405 * duplicative; but since this isn't a high-traffic case it's not worth
406 * trying to optimize.)
415 * Attachment failed. The cases we're interested in are the same as
416 * for the shmctl() call above. In particular, note that the owning
417 * postmaster could have terminated and removed the segment between
418 * shmctl() and shmat().
420 * If attachAt isn't NULL, it's possible that EINVAL reflects a
421 * problem with that address not a vanished segment, so it's best to
422 * pass NULL when probing for conflicting segments.
428#ifdef HAVE_LINUX_EIDRM_BUG
432 /* Otherwise, be conservative. */
438 hdr->device != statbuf.
st_dev ||
439 hdr->inode != statbuf.
st_ino)
442 * It's either not a Postgres segment, or not one for my data directory.
449 * It does match our data directory, so now test whether any processes are
450 * still attached to it. (We are, now, but the shm_nattch result is from
451 * before we attached to it.)
457 * Identify the huge page size to use, and compute the related mmap flags.
459 * Some Linux kernel versions have a bug causing mmap() to fail on requests
460 * that are not a multiple of the hugepage size. Versions without that bug
461 * instead silently round the request up to the next hugepage multiple ---
462 * and then munmap() fails when we give it a size different from that.
463 * So we have to round our request up to a multiple of the actual hugepage
464 * size to avoid trouble.
466 * Doing the round-up ourselves also lets us make use of the extra memory,
467 * rather than just wasting it. Currently, we just increase the available
468 * space recorded in the shmem header, which will make the extra usable for
469 * purposes such as additional locktable entries. Someday, for very large
470 * hugepage sizes, we might want to think about more invasive strategies,
471 * such as increasing shared_buffers to absorb the extra space.
473 * Stores the (real, assumed, or config-provided) page size in
474 * *hugepagesize, and the hugepage-related mmap flags to use in
475 * *mmap_flags, if requested by the caller. If huge pages are not supported,
476 * *hugepagesize and *mmap_flags are set to 0.
483 Size default_hugepagesize = 0;
484 Size hugepagesize_local = 0;
485 int mmap_flags_local = 0;
488 * System-dependent code to find out the default huge page size.
490 * On Linux, read /proc/meminfo looking for a line like "Hugepagesize:
491 * nnnn kB". Ignore any failures, falling back to the preset default.
503 while (fgets(
buf,
sizeof(
buf), fp))
505 if (sscanf(
buf,
"Hugepagesize: %u %c", &sz, &ch) == 2)
509 default_hugepagesize = sz * (
Size) 1024;
512 /* We could accept other units besides kB, if needed */
518#endif /* __linux__ */
522 /* If huge page size is requested explicitly, use that. */
525 else if (default_hugepagesize != 0)
527 /* Otherwise use the system default, if we have it. */
528 hugepagesize_local = default_hugepagesize;
533 * If we fail to find out the system's default huge page size, or no
534 * huge page size is requested explicitly, assume it is 2MB. This will
535 * work fine when the actual size is less. If it's more, we might get
536 * mmap() or munmap() failures due to unaligned requests; but at this
537 * writing, there are no reports of any non-Linux systems being picky about that.
540 hugepagesize_local = 2 * 1024 * 1024;
543 mmap_flags_local = MAP_HUGETLB;
546 * On recent enough Linux, also include the explicit page size, if necessary.
549#if defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT)
550 if (hugepagesize_local != default_hugepagesize)
554 mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
558 /* assign the results found */
560 *mmap_flags = mmap_flags_local;
562 *hugepagesize = hugepagesize_local;
571#endif /* MAP_HUGETLB */
575 * GUC check_hook for huge_page_size
580#if !(defined(MAP_HUGE_MASK) && defined(MAP_HUGE_SHIFT))
581 /* Recent enough Linux only, for now. See GetHugePageSize(). */
592 * Creates an anonymous mmap()ed shared memory segment.
594 * Pass the requested size in *size. This function will modify *size to the
595 * actual size of the allocation, if it ends up allocating a segment that is
596 * larger than requested.
601 Size allocsize = *size;
606 /* PGSharedMemoryCreate should have dealt with this case */
612 * Round up the request size to a suitable large value.
619 if (allocsize % hugepagesize != 0)
620 allocsize += hugepagesize - (allocsize % hugepagesize);
622 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
626 elog(
DEBUG1,
"mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
632 * Report whether huge pages are in use. This needs to be tracked before
633 * the second mmap() call if attempting to use huge pages failed previously.
642 * Use the original size, not the rounded-up value, when falling back to non-huge pages.
646 ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
655 (
errmsg(
"could not map anonymous shared memory: %m"),
656 (mmap_errno == ENOMEM) ?
657 errhint(
"This error usually means that PostgreSQL's request "
658 "for a shared memory segment exceeded available memory, "
659 "swap space, or huge pages. To reduce the request size "
660 "(currently %zu bytes), reduce PostgreSQL's shared "
661 "memory usage, perhaps by reducing \"shared_buffers\" or "
662 "\"max_connections\".",
671 * AnonymousShmemDetach --- detach from an anonymous mmap'd block
672 * (called as an on_shmem_exit callback, hence funny argument list)
677 /* Release anonymous shared memory block, if any. */
681 elog(
LOG,
"munmap(%p, %zu) failed: %m",
688 * PGSharedMemoryCreate
690 * Create a shared memory segment of the given size and initialize its
691 * standard header. Also, register an on_shmem_exit callback to release the storage.
694 * Dead Postgres segments pertinent to this DataDir are recycled if found, but
695 * we do not fail upon collision with foreign shmem segments. The idea here
696 * is to detect and re-use keys that may have been assigned by a crashed
697 * postmaster or backend.
710 * We use the data directory's ID info (inode and device numbers) to
711 * positively identify shmem segments associated with this data dir, and
712 * also as seeds for searching for a free shmem key.
717 errmsg(
"could not stat data directory \"%s\": %m",
720 /* Complain if hugepages demanded but we can't possibly support them */
721#if !defined(MAP_HUGETLB)
724 (
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
725 errmsg(
"huge pages not supported on this platform")));
728 /* For now, we don't support huge pages in SysV memory */
731 (
errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
732 errmsg(
"huge pages not supported with the current \"shared_memory_type\" setting")));
734 /* Room for a header? */
742 /* Register on-exit routine to unmap the anonymous segment */
745 /* Now we need only allocate a minimal-sized SysV shmem block. */
752 /* huge pages are only available with mmap */
758 * Loop till we find a free IPC key. Trust CreateDataDirLockFile() to
759 * ensure no more than one postmaster per data directory can enter this
760 * loop simultaneously. (CreateDataDirLockFile() does not entirely ensure
761 * that, but prefer fixing it over coping here.)
763 NextShmemSegID = statbuf.
st_ino;
771 /* Try to create new segment */
774 break;
/* successful create and attach */
776 /* Check shared memory and possibly remove and recreate */
779 * shmget() failure is typically EACCES, hence SHMSTATE_FOREIGN.
780 * ENOENT, a narrow possibility, implies SHMSTATE_ENOENT, but one can
781 * safely treat SHMSTATE_ENOENT like SHMSTATE_FOREIGN.
797 (
errcode(ERRCODE_LOCK_FILE_EXISTS),
798 errmsg(
"pre-existing shared memory block (key %lu, ID %lu) is still in use",
799 (
unsigned long) NextShmemSegID,
800 (
unsigned long) shmid),
801 errhint(
"Terminate any old server processes associated with data directory \"%s\".",
807 * To our surprise, some other process deleted the segment since our last
808 * InternalIpcMemoryCreate(). Moments earlier, we would have
809 * seen SHMSTATE_FOREIGN. Try that same ID again.
812 "shared memory block (key %lu, ID %lu) deleted during startup",
813 (
unsigned long) NextShmemSegID,
814 (
unsigned long) shmid);
822 * The segment pertains to DataDir, and every process that had
823 * used it has died or detached. Zap it, if possible, and any
824 * associated dynamic shared memory segments, as well. This
825 * shouldn't fail, but if it does, assume the segment belongs
826 * to someone else after all, and try the next candidate.
827 * Otherwise, try again to create the segment. That may fail
828 * if some other process creates the same shmem key before we
829 * do, in which case we'll try the next key.
833 if (shmctl(shmid,
IPC_RMID, NULL) < 0)
838 if (oldhdr && shmdt(oldhdr) < 0)
839 elog(
LOG,
"shmdt(%p) failed: %m", oldhdr);
842 /* Initialize new segment. */
848 /* Fill in the data directory ID info, too */
853 * Initialize space allocation status for segment.
859 /* Save info for possible future use */
864 * If AnonymousShmem is NULL here, then we're not using anonymous shared
865 * memory, and should return a pointer to the System V shared memory
866 * block. Otherwise, the System V shared memory block is only a shim, and
867 * we must return a pointer to the real block.
878 * PGSharedMemoryReAttach
880 * This is called during startup of a postmaster child process to re-attach to
881 * an already existing shared memory segment. This is needed only in the
882 * EXEC_BACKEND case; otherwise postmaster children inherit the shared memory
883 * segment attachment via fork().
885 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
886 * routine. The caller must have already restored them to the postmaster's values.
901 /* cygipc (currently) appears to not detach on exec. */
913 elog(
FATAL,
"could not reattach to shared memory (key=%d, addr=%p): %m",
915 if (hdr != origUsedShmemSegAddr)
916 elog(
FATAL,
"reattaching to shared memory returned unexpected address (got %p, expected %p)",
917 hdr, origUsedShmemSegAddr);
924 * PGSharedMemoryNoReAttach
926 * This is called during startup of a postmaster child process when we choose
927 * *not* to re-attach to the existing shared memory segment. We must clean up
928 * to leave things in the appropriate state. This is not used in the non
929 * EXEC_BACKEND case, either.
931 * The child process startup logic might or might not call PGSharedMemoryDetach
932 * after this; make sure that it will be a no-op if called.
934 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
935 * routine. The caller must have already restored them to the postmaster's values.
945 /* cygipc (currently) appears to not detach on exec. */
949 /* For cleanliness, reset UsedShmemSegAddr to show we're not attached. */
951 /* And the same for UsedShmemSegID. */
955#endif /* EXEC_BACKEND */
958 * PGSharedMemoryDetach
960 * Detach from the shared memory segment, if still attached. This is not
961 * intended to be called explicitly by the process that originally created the
962 * segment (it will have on_shmem_exit callback(s) registered to do that).
963 * Rather, this is for subprocesses that have inherited an attachment and want to get rid of it.
966 * UsedShmemSegID and UsedShmemSegAddr are implicit parameters to this
967 * routine, also AnonymousShmem and AnonymousShmemSize.
975#
if defined(EXEC_BACKEND) && defined(__CYGWIN__)
976 /* Work-around for cygipc exec bug */
987 elog(
LOG,
"munmap(%p, %zu) failed: %m",
void dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
struct PGShmemHeader PGShmemHeader
int errcode_for_file_access(void)
int errdetail(const char *fmt,...)
int errhint(const char *fmt,...)
int errcode(int sqlerrcode)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
FILE * AllocateFile(const char *name, const char *mode)
void SetConfigOption(const char *name, const char *value, GucContext context, GucSource source)
#define GUC_check_errdetail
Assert(PointerIsAligned(start, uint64))
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
void AddToDataDirLockFile(int target_line, const char *str)
static uint64 pg_ceil_log2_64(uint64 num)
static rewind_source * source
#define LOCK_FILE_LINE_SHMEM_KEY
static Datum PointerGetDatum(const void *X)
static Pointer DatumGetPointer(Datum X)
static Datum Int32GetDatum(int32 X)
static int32 DatumGetInt32(Datum X)
static void AnonymousShmemDetach(int status, Datum arg)
void PGSharedMemoryDetach(void)
@ SHMSTATE_ANALYSIS_FAILURE
PGShmemHeader * PGSharedMemoryCreate(Size size, PGShmemHeader **shim)
static Size AnonymousShmemSize
unsigned long UsedShmemSegID
bool check_huge_page_size(int *newval, void **extra, GucSource source)
static void * CreateAnonymousSegment(Size *size)
static void * InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size)
void GetHugePageSize(Size *hugepagesize, int *mmap_flags)
bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2)
static void IpcMemoryDetach(int status, Datum shmaddr)
static IpcMemoryState PGSharedMemoryAttach(IpcMemoryId shmId, void *attachAt, PGShmemHeader **addr)
static void * AnonymousShmem
static void IpcMemoryDelete(int status, Datum shmId)
void PGSharedMemoryReAttach(void)
void PGSharedMemoryNoReAttach(void)