1/*-------------------------------------------------------------------------
4 * A data structure for keeping track of files that have changed.
6 * This source file contains the logic to decide what to do with different
7 * kinds of files, and the data structure to support it. Before modifying
8 * anything, pg_rewind collects information about all the files and their
9 * attributes in the target and source data directories. It also scans the
10 * WAL in the target, and collects information about data blocks that
11 * were changed. All this information is stored in a hash table, using the
12 * file path relative to the root of the data directory as the key.
14 * After collecting all the information required, the decide_file_actions()
15 * function scans the hash table and decides what action needs to be taken
16 * for each file. Finally, it sorts the array to the final order that the
17 * actions should be executed in.
19 * Copyright (c) 2013-2025, PostgreSQL Global Development Group
21 *-------------------------------------------------------------------------
30#include "catalog/pg_tablespace_d.h"
39 * Define a hash table which we can use to store information about the files
40 * appearing in source and target systems.
42#define SH_PREFIX filehash
43#define SH_ELEMENT_TYPE file_entry_t
44#define SH_KEY_TYPE const char *
46#define SH_HASH_KEY(tb, key) hash_string(key)
47#define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
48#define SH_SCOPE static inline
49#define SH_RAW_ALLOCATOR pg_malloc0
54 #define FILEHASH_INITIAL_SIZE 1000
66 * A separate hash table which tracks WAL files that must not be deleted.
74 #define SH_PREFIX keepwal
75 #define SH_ELEMENT_TYPE keepwal_entry
76 #define SH_KEY_TYPE const char *
78 #define SH_HASH_KEY(tb, key) hash_string(key)
79 #define SH_EQUAL(tb, a, b) (strcmp(a, b) == 0)
80 #define SH_SCOPE static inline
81 #define SH_RAW_ALLOCATOR pg_malloc0
86 #define KEEPWAL_INITIAL_SIZE 1000
97 * Definition of one element of an exclusion list, used to exclude
98 * contents when rewinding. "name" is the name of the file or path to
99 * check for exclusion. If "match_prefix" is true, any items matching
100 * the name as prefix are excluded.
109 * The contents of these directories are removed or recreated during server
110 * start so they are not included in data processed by pg_rewind.
112 * Note: those lists should be kept in sync with what basebackup.c provides.
113 * Some of the values, contrary to what basebackup.c uses, are hardcoded as
114 * they are defined in backend-only headers. So this list is maintained
115 * with a best effort in mind.
120 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped
121 * because extensions like pg_stat_statements store data there.
123 "pg_stat_tmp",
/* defined as PG_STAT_TMP_DIR */
126 * It is generally not useful to back up the contents of this directory
127 * even if the intention is to restore to another primary. See backup.sgml
128 * for a more detailed description.
130 "pg_replslot",
/* defined as PG_REPLSLOT_DIR */
132 /* Contents removed on startup, see dsm_cleanup_for_mmap(). */
133 "pg_dynshmem",
/* defined as PG_DYNSHMEM_DIR */
135 /* Contents removed on startup, see AsyncShmemInit(). */
139 * Old contents are loaded for possible debugging but are not required for
140 * normal operation, see SerialInit().
144 /* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
147 /* Contents zeroed on startup, see StartupSUBTRANS(). */
155 * List of files excluded from filemap processing. Files are excluded
156 * if their prefix matches.
160 /* Skip auto conf temporary file. */
161 {
"postgresql.auto.conf.tmp",
false},
/* defined as PG_AUTOCONF_FILENAME */
163 /* Skip current log file temporary file */
164 {
"current_logfiles.tmp",
false},
/* defined as
165 * LOG_METAINFO_DATAFILE_TMP */
167 /* Skip relation cache because it is rebuilt on startup */
168 {
"pg_internal.init",
true},
/* defined as RELCACHE_INIT_FILENAME */
171 * If there is a backup_label or tablespace_map file, it indicates that a
172 * recovery failed and this cluster probably can't be rewound, but exclude
173 * them anyway if they are found.
175 {
"backup_label",
false},
/* defined as BACKUP_LABEL_FILE */
176 {
"tablespace_map",
false},
/* defined as TABLESPACE_MAP */
179 * If there's a backup_manifest, it belongs to a backup that was used to
180 * start this server. It is *not* correct for this backup. Our
181 * backup_manifest is injected into the backup separately if users want
184 {
"backup_manifest",
false},
186 {
"postmaster.pid",
false},
187 {
"postmaster.opts",
false},
194 * Initialize the hash table for the file map.
202/* Look up entry for 'path', creating a new one if it doesn't exist */
209 entry = filehash_insert(
filehash, path, &found);
236 return filehash_lookup(
filehash, path);
240 * Initialize a hash table to store WAL file names that must be kept.
245 /* An initial hash size out of thin air */
249/* Mark the given file to prevent its removal */
256 /* Should only be called with keepwal initialized */
259 entry = keepwal_insert(
keepwal, path, &found);
265/* Return true if file is marked as not to be removed, false otherwise */
269 return keepwal_lookup(
keepwal, path) != NULL;
273 * Callback for processing source file list.
275 * This is called once for every file in the source server. We record the
276 * type and size of the file, so that decide_file_action() can later decide what
281 const char *link_target)
286 * Pretend that pg_wal is a directory, even if it's really a symlink. We
287 * don't want to mess with the symlink itself, nor complain if it's a
288 * symlink in source but not in target or vice versa.
294 * sanity check: a filename that looks like a data file better be a
298 pg_fatal(
"data file \"%s\" in source is not a regular file", path);
300 /* Remember this source file */
303 pg_fatal(
"duplicate source file \"%s\"", path);
311 * Callback for processing target file list.
313 * Record the type and size of the file, like process_source_file() does.
317 const char *link_target)
322 * Do not apply any exclusion filters here. This has the advantage of
323 * removing from the target data folder all paths which have been filtered
324 * out from the source data folder when processing the source files.
328 * Like in process_source_file, pretend that pg_wal is always a directory.
333 /* Remember this target file */
336 pg_fatal(
"duplicate source file \"%s\"", path);
344 * This callback gets called while we read the WAL in the target, for every
345 * block that has changed in the target system. It decides if the given
346 * 'blkno' in the target relfile needs to be overwritten from the source, and
347 * if so, records it in 'target_pages_to_overwrite' bitmap.
349 * NOTE: All the files on both systems must have already been added to the
361 segno = blkno / RELSEG_SIZE;
362 blkno_inseg = blkno % RELSEG_SIZE;
369 * If the block still exists in both systems, remember it. Otherwise we
370 * can safely ignore it.
372 * If the block is beyond the EOF in the source system, or the file
373 * doesn't exist in the source at all, we're going to truncate/remove it
374 * away from the target anyway. Likewise, if it doesn't exist in the
375 * target anymore, we will copy it over with the "tail" from the source
378 * It is possible to find WAL for a file that doesn't exist on either
379 * system anymore. It means that the relation was dropped later in the
380 * target system, and independently on the source system too, or that it
381 * was created and dropped in the target system and it never existed in
382 * the source. Either way, we can safely ignore it.
391 pg_fatal(
"unexpected page modification for non-regular file \"%s\"",
398 end_offset = (blkno_inseg + 1) * BLCKSZ;
399 if (end_offset <= entry->source_size && end_offset <= entry->target_size)
407 * Is this the path of a file that pg_rewind can skip copying?
417 * Skip all temporary files, .../pgsql_tmp/... and .../pgsql_tmp.*
425 /* check individual files... */
441 pg_log_debug(
"entry \"%s\" excluded from source file list",
444 pg_log_debug(
"entry \"%s\" excluded from target file list",
451 * ... And check some directories. Note that this includes any contents
452 * within the directories themselves.
456 snprintf(localpath,
sizeof(localpath),
"%s/",
458 if (strstr(path, localpath) == path)
461 pg_log_debug(
"entry \"%s\" excluded from source file list",
464 pg_log_debug(
"entry \"%s\" excluded from target file list",
497 * Calculate the totals needed for progress reports.
563 * Does it look like a relation data file?
565 * For our purposes, only files belonging to the main fork are considered
566 * relation files. Other forks are always copied in toto, because we cannot
567 * reliably track changes to them, because WAL only contains block references
579 * Relation data files can be in one of the following directories:
585 * regular relations, default tablespace
587 * pg_tblspc/<tblspc oid>/<tblspc version>/
588 * within a non-default tablespace (the name of the directory
589 * depends on version)
591 * And the relation data files themselves have a filename like:
593 * <oid>.<segment number>
603 nmatch = sscanf(path,
"global/%u.%u", &rlocator.
relNumber, &segNo);
604 if (nmatch == 1 || nmatch == 2)
606 rlocator.
spcOid = GLOBALTABLESPACE_OID;
612 nmatch = sscanf(path,
"base/%u/%u.%u",
614 if (nmatch == 2 || nmatch == 3)
616 rlocator.
spcOid = DEFAULTTABLESPACE_OID;
624 if (nmatch == 3 || nmatch == 4)
630 * The sscanf tests above can match files that have extra characters at
631 * the end. To eliminate such cases, cross-check that GetRelationPath
632 * creates the exact same filename, when passed the RelFileLocator
633 * information we extracted from the filename.
639 if (strcmp(check_path, path) != 0)
649 * A helper function to create the path of a relation file and segment.
651 * The returned path is palloc'd
670 * In the final stage, the filemap is sorted so that removals come last.
671 * From disk space usage point of view, it would be better to do removals
672 * first, but for now, safety first. If a whole directory is deleted, all
673 * files and subdirectories inside it need to be removed first. On creation,
674 * parent directory needs to be created before files and directories inside
675 * it. To achieve that, the file_action_t enum is ordered so that we can
676 * just sort on that first. Furthermore, sort REMOVE entries in reverse
677 * path order, so that "foo/bar" subdirectory is removed before "foo".
685 if (
fa->action >
fb->action)
687 if (
fa->action <
fb->action)
691 return strcmp(
fb->path,
fa->path);
693 return strcmp(
fa->path,
fb->path);
697 * Decide what action to perform on a file.
702 const char *path = entry->
path;
705 * Don't touch the control file. It is handled specially, after copying
706 * all the other files.
711 /* Skip macOS system files */
712 if (strstr(path,
".DS_Store") != NULL)
716 * Remove all files matching the exclusion filters in the target.
727 * Handle cases where the file is missing from one of the systems.
732 * File exists in source, but not in target. Copy it in toto. (If it's
733 * a relation data file, WAL replay after rewinding should re-create
734 * it anyway. But there's no harm in copying it now.)
751 * For files that exist in target but not in source, we check the
752 * keepwal hash table; any files listed therein must not be removed.
756 pg_log_debug(
"Not removing file \"%s\" because it is required for recovery", path);
764 * Doesn't exist in either server. Why does it have an entry in the
772 * Otherwise, the file exists on both systems
778 /* But it's a different kind of object. Strange.. */
779 pg_fatal(
"file \"%s\" is of different type in source and target", entry->
path);
783 * PG_VERSION files should be identical on both systems, but avoid
784 * overwriting them for paranoia.
797 * XXX: Should we check if it points to the same target?
805 * It's a non-data file that we have no special processing
806 * for. Copy it in toto.
813 * It's a data file that exists in both systems.
815 * If it's larger in target, we can truncate it. There will
816 * also be a WAL record of the truncation in the source
817 * system, so WAL replay would eventually truncate the target
818 * too, but we might as well do it now.
820 * If it's smaller in the target, it means that it has been
821 * truncated in the target, or enlarged in the source, or
822 * both. If it was truncated in the target, we need to copy
823 * the missing tail from the source system. If it was enlarged
824 * in the source system, there will be WAL records in the
825 * source system for the new blocks, so we wouldn't need to
826 * copy them here. But we don't know which scenario we're
827 * dealing with, and there's no harm in copying the missing
828 * blocks now, so do it now.
830 * If it's the same size, do nothing here. Any blocks modified
831 * in the target will be copied based on parsing the target
832 * system's WAL, and any blocks modified in the source will be
833 * updated after rewinding, when the source system's WAL is
846 pg_fatal(
"unknown file type for \"%s\"", path);
851 pg_fatal(
"could not decide what to do with file \"%s\"", path);
855 * Decide what to do with each file.
857 * Returns a 'filemap' with the entries in the order that their actions
858 * should be executed.
864 filehash_iterator it;
868 filehash_start_iterate(
filehash, &it);
869 while ((entry = filehash_iterate(
filehash, &it)) != NULL)
875 * Turn the hash table into an array, and sort in the order that the
876 * actions should be performed.
881 filehash_start_iterate(
filehash, &it);
883 while ((entry = filehash_iterate(
filehash, &it)) != NULL)
bool datapagemap_next(datapagemap_iterator_t *iter, BlockNumber *blkno)
void datapagemap_print(datapagemap_t *map)
datapagemap_iterator_t * datapagemap_iterate(datapagemap_t *map)
void datapagemap_add(datapagemap_t *map, BlockNumber blkno)
void * pg_malloc(size_t size)
char * pg_strdup(const char *in)
#define PG_TEMP_FILES_DIR
#define PG_TEMP_FILE_PREFIX
static const struct exclude_list_item excludeFiles[]
struct keepwal_entry keepwal_entry
static bool isRelDataFile(const char *path)
static bool keepwal_entry_exists(const char *path)
static const char * action_to_str(file_action_t action)
void process_source_file(const char *path, file_type_t type, size_t size, const char *link_target)
static const char *const excludeDirContents[]
void print_filemap(filemap_t *filemap)
static char * datasegpath(RelFileLocator rlocator, ForkNumber forknum, BlockNumber segno)
static filehash_hash * filehash
void process_target_file(const char *path, file_type_t type, size_t size, const char *link_target)
filemap_t * decide_file_actions(void)
void process_target_wal_block_change(ForkNumber forknum, RelFileLocator rlocator, BlockNumber blkno)
void keepwal_add_entry(const char *path)
static keepwal_hash * keepwal
static file_action_t decide_file_action(file_entry_t *entry)
static file_entry_t * lookup_filehash_entry(const char *path)
#define KEEPWAL_INITIAL_SIZE
static file_entry_t * insert_filehash_entry(const char *path)
static bool check_file_excluded(const char *path, bool is_source)
#define FILEHASH_INITIAL_SIZE
static int final_filemap_cmp(const void *a, const void *b)
void calculate_totals(filemap_t *filemap)
Assert(PointerIsAligned(start, uint64))
#define pg_log_debug(...)
char * pstrdup(const char *in)
void pfree(void *pointer)
char * last_dir_separator(const char *filename)
#define qsort(a, b, c, d)
char * psprintf(const char *fmt,...)
#define InvalidRelFileNumber
#define relpathperm(rlocator, forknum)
#define TABLESPACE_VERSION_DIRECTORY
bool pg_str_endswith(const char *str, const char *end)
char str[REL_PATH_STR_MAXLEN+1]
datapagemap_t target_pages_to_overwrite
char * source_link_target
char * target_link_target
file_entry_t * entries[FLEXIBLE_ARRAY_MEMBER]
#define XLOG_CONTROL_FILE