1/*-------------------------------------------------------------------------
3 * File-processing utility routines.
5 * Assorted utility functions to work on files.
8 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
9 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/common/file_utils.c
13 *-------------------------------------------------------------------------
36/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
37#if defined(HAVE_SYNC_FILE_RANGE)
38#define PG_FLUSH_DATA_WORKS 1
39#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
40#define PG_FLUSH_DATA_WORKS 1
44 * pg_xlog has been renamed to pg_wal in version 10.
46#define MINIMUM_VERSION_FOR_PG_WAL 100000
48static void walkdir(
const char *path,
49 int (*
action) (
const char *fname,
bool isdir),
50 bool process_symlinks,
51 const char *exclude_dir);
56 * do_syncfs -- Try to syncfs a file system
58 * Reports errors trying to open the path. syncfs() errors are fatal.
61do_syncfs(
const char *path)
65 fd = open(path, O_RDONLY, 0);
75 pg_log_error(
"could not synchronize file system for file \"%s\": %m", path);
83#endif /* HAVE_SYNCFS */
86 * Synchronize PGDATA and all its contents.
88 * We sync regular files and directories wherever they are, but we follow
89 * symlinks only for pg_wal (or pg_xlog) and immediately under pg_tblspc.
90 * Other symlinks are presumed to point at files we're not responsible for
91 * syncing, and might not have privileges to write at all.
93 * serverVersion indicates the version of the server to be sync'd.
95 * If sync_data_files is false, this function skips syncing "base/" and any
96 * other tablespace directories.
104 bool xlog_is_symlink;
108 /* handle renaming of pg_xlog to pg_wal in post-10 clusters */
114 * If pg_wal is a symlink, we'll need to recurse into it separately,
115 * because the first walkdir below will ignore it.
117 xlog_is_symlink =
false;
122 if (
lstat(pg_wal, &st) < 0)
125 xlog_is_symlink =
true;
133 pg_log_error(
"this build does not support sync method \"%s\"",
141 * On Linux, we don't have to open every single file one by
142 * one. We can use syncfs() to sync whole filesystems. We
143 * only expect filesystem boundaries to exist where we
144 * tolerate symlinks, namely pg_wal and the tablespaces, so we
145 * call syncfs() for each of those directories.
148 /* Sync the top level pgdata directory. */
151 /* If any tablespaces are configured, sync each of those. */
160 while (errno = 0, (de =
readdir(dir)) != NULL)
164 if (strcmp(de->
d_name,
".") == 0 ||
165 strcmp(de->
d_name,
"..") == 0)
181 /* If pg_wal is a symlink, process that too. */
184#endif /* HAVE_SYNCFS */
190 char *exclude_dir = NULL;
196 * If possible, hint to the kernel that we're soon going to
197 * fsync the data directory and its contents.
199#ifdef PG_FLUSH_DATA_WORKS
202 walkdir(pg_wal, pre_sync_fname,
false, NULL);
204 walkdir(pg_tblspc, pre_sync_fname,
true, NULL);
208 * Now we do the fsync()s in the same order.
210 * The main call ignores symlinks, so in addition to specially
211 * processing pg_wal if it's a symlink, pg_tblspc has to be
212 * visited separately with process_symlinks = true. Note that
213 * if there are any plain directories in pg_tblspc, they'll
214 * get fsync'd twice. That's not an expected case so we don't
215 * worry about optimizing it.
231 * Synchronize the given directory and all its contents.
233 * This is a convenient wrapper on top of walkdir() and do_syncfs().
243 pg_log_error(
"this build does not support sync method \"%s\"",
248 * On Linux, we don't have to open every single file one by
249 * one. We can use syncfs() to sync the whole filesystem.
252#endif /* HAVE_SYNCFS */
259 * If possible, hint to the kernel that we're soon going to
260 * fsync the data directory and its contents.
262#ifdef PG_FLUSH_DATA_WORKS
263 walkdir(dir, pre_sync_fname,
false, NULL);
273 * walkdir: recursively walk a directory, applying the action to each
274 * regular file and directory (including the named directory itself).
276 * If process_symlinks is true, the action and recursion are also applied
277 * to regular files and directories that are pointed to by symlinks in the
278 * given directory; otherwise symlinks are ignored. Symlinks are always
279 * ignored in subdirectories, ie we intentionally don't pass down the
280 * process_symlinks flag to recursive calls.
282 * If exclude_dir is not NULL, it specifies a directory path to skip processing.
285 * Errors are reported but not considered fatal.
287 * See also walkdir in fd.c, which is a backend version of this logic.
291 int (*
action) (
const char *fname,
bool isdir),
292 bool process_symlinks,
293 const char *exclude_dir)
298 if (exclude_dir && strcmp(exclude_dir, path) == 0)
304 pg_log_error(
"could not open directory \"%s\": %m", path);
308 while (errno = 0, (de =
readdir(dir)) != NULL)
312 if (strcmp(de->
d_name,
".") == 0 ||
313 strcmp(de->
d_name,
"..") == 0)
329 * Errors are already reported directly by get_dirent_type(),
330 * and any remaining symlinks and unknown file types are ignored.
338 pg_log_error(
"could not read directory \"%s\": %m", path);
343 * It's important to fsync the destination directory itself as individual
344 * file fsyncs don't guarantee that the directory entry for the file is
345 * synced. Recent versions of ext4 have made the window much wider but
346 * it's been an issue for ext3 and other filesystems in the past.
348 (*action) (path,
true);
352 * Hint to the OS that it should get ready to fsync() this file, if supported by the platform.
355 * Ignores errors trying to open unreadable files, and reports other errors
359pre_sync_fname(
const char *fname,
bool isdir)
361#ifdef PG_FLUSH_DATA_WORKS
368 if (errno == EACCES || (isdir && errno == EISDIR))
375 * We do what pg_flush_data() would do in the backend: prefer to use
376 * sync_file_range, but fall back to posix_fadvise. We ignore errors
377 * because this is only a hint.
379#if defined(HAVE_SYNC_FILE_RANGE)
380 (void) sync_file_range(
fd, 0, 0, SYNC_FILE_RANGE_WRITE);
381#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
382 (void) posix_fadvise(
fd, 0, 0, POSIX_FADV_DONTNEED);
384#error PG_FLUSH_DATA_WORKS should not have been defined
388#endif /* PG_FLUSH_DATA_WORKS */
393 * fsync_fname -- Try to fsync a file or directory
395 * Ignores errors trying to open unreadable files, or trying to fsync
396 * directories on systems where that isn't allowed/required. All other errors
407 * Some OSs require directories to be opened read-only whereas other
408 * systems don't allow us to fsync files opened read-only; so we need both
409 * cases here. Using O_RDWR will cause us to fail to fsync files that are
410 * not writable by our userid, but we assume that's OK.
419 * Open the file, silently ignoring errors about unreadable files (or
420 * unsupported operations, e.g. opening a directory under Windows), and
423 fd = open(fname, flags, 0);
426 if (errno == EACCES || (isdir && errno == EISDIR))
435 * Some OSes don't allow us to fsync directories at all, so we can ignore
436 * those errors. Anything else needs to be reported.
438 if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
450 * fsync_parent_path -- fsync the parent path of a file or directory
452 * This is aimed at making file operations persistent on disk in case of
453 * an OS crash or power failure.
464 * get_parent_directory() returns an empty string if the input argument is
465 * just a file name (see comments in path.c), so handle that as being the current directory.
468 if (strlen(parentpath) == 0)
478 * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
480 * Wrapper around rename, similar to the backend version.
488 * First fsync the old and target path (if it exists), to ensure that they
489 * are properly persistent on disk. Syncing the target file is not
490 * strictly necessary, but it makes it easier to reason about crashes;
491 * because it's then guaranteed that either source or target file exists after a crash.
502 pg_log_error(
"could not open file \"%s\": %m", newfile);
510 pg_log_error(
"could not fsync file \"%s\": %m", newfile);
517 /* Time to do the real deal... */
518 if (rename(oldfile, newfile) != 0)
520 pg_log_error(
"could not rename file \"%s\" to \"%s\": %m",
526 * To guarantee renaming the file is persistent, fsync the file with its
527 * new name, and its containing directory.
541 * Return the type of a directory entry.
543 * In frontend code, elevel should be a level from logging.h; in backend code
544 * it should be a level from elog.h.
549 bool look_through_symlinks,
555 * Some systems tell us the type directly in the dirent struct, but that's
556 * a BSD and Linux extension not required by POSIX. Even when the
557 * interface is present, sometimes the type is unknown, depending on the filesystem.
560#if defined(DT_REG) && defined(DT_DIR) && defined(DT_LNK)
565 else if (de->
d_type ==
DT_LNK && !look_through_symlinks)
579 if (look_through_symlinks)
580 sret =
stat(path, &fst);
582 sret =
lstat(path, &fst);
592 errmsg(
"could not stat file \"%s\": %m", path)));
607 * Compute what remains to be done after a possibly partial vectored read or
608 * write. The part of 'source' beginning after 'transferred' bytes is copied
609 * to 'destination', and its length is returned. 'source' and 'destination'
610 * may point to the same array, for in-place adjustment. A return value of
611 * zero indicates completion (for callers without a cheaper way to know that).
615 const struct iovec *
source,
621 /* Skip wholly transferred iovecs. */
622 while (
source->iov_len <= transferred)
624 transferred -=
source->iov_len;
628 /* All iovecs transferred? */
632 * We don't expect the kernel to transfer more than we asked it
633 * to, or something is out of sync.
640 /* Copy the remaining iovecs to the front of the array. */
641 if (
source != destination)
644 /* Adjust leading iovec, which may have been partially transferred. */
645 Assert(destination->iov_len > transferred);
646 destination->iov_base = (
char *) destination->iov_base + transferred;
647 destination->iov_len -= transferred;
653 * pg_pwritev_with_retry
655 * Convenience wrapper for pg_pwritev() that retries on partial write. If an
656 * error is returned, it is unspecified how much has been written.
665 /* We'd better have space to make a copy, in case we need to retry. */
674 /* Write as much as we can. */
679#ifdef SIMULATE_SHORT_WRITE
680 part =
Min(part, 4096);
683 /* Count our progress. */
688 * See what is left. On the first loop we used the caller's array,
689 * but in later loops we'll use our local copy that we are allowed to modify.
694 }
while (iovcnt > 0);
702 * Writes zeros to file worth "size" bytes at "offset" (from the start of the
703 * file), using vectored I/O.
705 * Returns the total amount of data written. On failure, a negative value
706 * is returned with errno set.
714 size_t remaining_size = size;
715 ssize_t total_written = 0;
717 /* Loop, writing as many blocks as we can for each system call. */
718 while (remaining_size > 0)
723 for (; iovcnt < PG_IOV_MAX && remaining_size > 0; iovcnt++)
725 size_t this_iov_size;
727 iov[iovcnt].iov_base = zerobuf_addr;
729 if (remaining_size < BLCKSZ)
730 this_iov_size = remaining_size;
732 this_iov_size = BLCKSZ;
734 iov[iovcnt].iov_len = this_iov_size;
735 remaining_size -= this_iov_size;
744 total_written += written;
747 Assert(total_written == size);
749 return total_written;
#define unconstify(underlying_type, expr)
struct dirent * readdir(DIR *)
DIR * opendir(const char *)
int errcode_for_file_access(void)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
int durable_rename(const char *oldfile, const char *newfile, int elevel)
void fsync_fname(const char *fname, bool isdir)
static int fsync_parent_path(const char *fname, int elevel)
static void walkdir(const char *path, void(*action)(const char *fname, bool isdir, int elevel), bool process_symlinks, int elevel)
int compute_remaining_iovec(struct iovec *destination, const struct iovec *source, int iovcnt, size_t transferred)
ssize_t pg_pwrite_zeros(int fd, size_t size, off_t offset)
ssize_t pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
PGFileType get_dirent_type(const char *path, const struct dirent *de, bool look_through_symlinks, int elevel)
@ DATA_DIR_SYNC_METHOD_SYNCFS
@ DATA_DIR_SYNC_METHOD_FSYNC
Assert(PointerIsAligned(start, uint64))
static bool sync_data_files
static DataDirSyncMethod sync_method
void pg_log_generic(enum pg_log_level level, enum pg_log_part part, const char *pg_restrict fmt,...)
#define pg_log_error(...)
Datum subpath(PG_FUNCTION_ARGS)
void pfree(void *pointer)
#define MINIMUM_VERSION_FOR_PG_WAL
static ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
static rewind_source * source
void get_parent_directory(char *path)
size_t strlcpy(char *dst, const char *src, size_t siz)
static int fd(const char *x, int i)
char * psprintf(const char *fmt,...)