1/*-------------------------------------------------------------------------
4 * Management of large buffered temporary files.
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
10 * src/backend/storage/file/buffile.c
14 * BufFiles provide a very incomplete emulation of stdio atop virtual Files
15 * (as managed by fd.c). Currently, we only support the buffered-I/O
16 * aspect of stdio: a read or write of the low-level File occurs only
17 * when the buffer is filled or emptied. This is an even bigger win
18 * for virtual Files than for ordinary kernel files, since reducing the
19 * frequency with which a virtual File is touched reduces "thrashing"
20 * of opening/closing file descriptors.
22 * Note that BufFile structs are allocated with palloc(), and therefore
23 * will go away automatically at query/transaction end. Since the underlying
24 * virtual Files are made with OpenTemporaryFile, all resources for
25 * the file are certain to be cleaned up even if processing is aborted
26 * by ereport(ERROR). The data structures required are made in the
27 * palloc context that was current when the BufFile was created, and
28 * any external resources such as temp files are owned by the ResourceOwner
29 * that was current at that time.
31 * BufFile also supports temporary files that exceed the OS file size limit
32 * (by opening multiple fd.c temporary files). This is an essential feature
33 * for sorts and hashjoins on large amounts of data.
35 * BufFile supports temporary files that can be shared with other backends, as
36 * infrastructure for parallel execution. Such files need to be created as a
37 * member of a SharedFileSet that all participants are attached to.
39 * BufFile also supports temporary files that can be used by a single backend
40 * when the corresponding files need to survive across transactions and
41 * need to be opened and closed multiple times. Such files need to be created
42 * as a member of a FileSet.
43 *-------------------------------------------------------------------------
58 * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
59 * The reason is that we'd like large BufFiles to be spread across multiple
60 * tablespaces when available.
62 #define MAX_PHYSICAL_FILESIZE 0x40000000
63 #define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
66 * This data structure represents a buffered file that consists of one or
67 * more physical files (each accessed through a virtual file descriptor
72 int numFiles;
/* number of physical files in set */
73 /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
74 File *
files;
/* palloc'd array with numFiles entries */
77 bool dirty;
/* does buffer need to be written? */
78 bool readOnly;
/* has the file been set to read only? */
81 const char *
name;
/* name of fileset based BufFile */
84 * resowner is the ResourceOwner to use for underlying temp files. (We
85 * don't need to remember the memory context we're using explicitly,
86 * because after creation we only repalloc our arrays larger.)
91 * "current pos" is position of start of buffer within the logical file.
92 * Position as seen by user of BufFile is (curFile, curOffset + pos).
94 int curFile;
/* file index (0..n) part of current pos */
96 int pos;
/* next read/write position in buffer */
97 int nbytes;
/* total # of valid bytes in buffer */
100 * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid
101 * wasting per-file alignment padding when some users create many files.
115 * Create BufFile and perform the common initialization.
135 * Create a BufFile given the first underlying physical file.
136 * NOTE: caller must set isInterXact if appropriate.
144 file->
files[0] = firstfile;
153 * Add another component temp file.
161 /* Be sure to associate the file with the BufFile's resource owner */
181 * Create a BufFile for a new temporary file (which will expand to become
182 * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
185 * If interXact is true, the temp file will not be automatically deleted
186 * at end of transaction.
188 * Note: if interXact is true, the caller had better be calling us in a
189 * memory context, and with a resource owner, that will survive across
190 * transaction boundaries.
199 * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
200 * Possibly the caller will have done this already, but it seems useful to
201 * double-check here. Failure to do this at all would result in the temp
202 * files always getting placed in the default tablespace, which is a
203 * pretty hard-to-detect bug. Callers may prefer to do it earlier if they
204 * want to be sure that any required catalog access is done in some other
219 * Build the name for a given segment of a given BufFile.
228 * Create a new segment file backing a fileset based BufFile.
237 * It is possible that there are files left over from before a crash
238 * restart with the same name. In order for BufFileOpenFileSet() not to
239 * get confused about how many segments there are, we'll unlink the next
240 * segment number if it already exists.
245 /* Create the new segment. */
249 /* FileSetCreate would've errored out */
256 * Create a BufFile that can be discovered and opened read-only by other
257 * backends that are attached to the same SharedFileSet using the same name.
259 * The naming scheme for fileset based BufFiles is left up to the calling code.
260 * The name will appear as part of one or more filenames on disk, and might
261 * provide clues to administrators about which subsystem is generating
262 * temporary file data. Since each SharedFileSet object is backed by one or
263 * more uniquely named temporary directories, names don't conflict with
264 * unrelated SharedFileSet objects.
282 * Open a file that was previously created in another backend (or this one)
283 * with BufFileCreateFileSet in the same FileSet using the same name.
284 * The backend that created the file must have called BufFileClose() or
285 * BufFileExportFileSet() to make sure that it is ready to be opened by other
286 * backends and render it read-only. If missing_ok is true, which indicates
287 * that missing files can be safely ignored, then return NULL if the BufFile
288 * with the given name is not found, otherwise, throw an error.
303 * We don't know how many segments there are, so we'll probe the
304 * filesystem to find out.
308 /* See if we need to expand our file segment array. */
309 if (nfiles + 1 > capacity)
314 /* Try to load a segment. */
317 if (files[nfiles] <= 0)
325 * If we didn't find any files at all, then no BufFile exists with this
330 /* free the memory */
338 errmsg(
"could not open temporary file \"%s\" from BufFile \"%s\": %m",
339 segment_name,
name)));
352 * Delete a BufFile that was created by BufFileCreateFileSet in the given
353 * FileSet using the given name.
355 * It is not necessary to delete files explicitly with this function. It is
356 * provided only as a way to delete files proactively, rather than waiting for
357 * the FileSet to be cleaned up.
359 * Only one backend should attempt to delete a given name, and should know
360 * that it exists and has been exported or closed; otherwise missing_ok should
371 * We don't know how many segments the file has. We'll keep deleting
372 * until we run out. If we don't manage to find even an initial segment,
386 if (!found && !missing_ok)
387 elog(
ERROR,
"could not delete unknown BufFile \"%s\"",
name);
391 * BufFileExportFileSet --- flush and make read-only, in preparation for sharing.
396 /* Must be a file belonging to a FileSet. */
399 /* It's probably a bug if someone calls this twice. */
409 * Like fclose(), this also implicitly FileCloses the underlying File.
416 /* flush any unwritten data */
418 /* close and delete the underlying file(s) */
421 /* release the buffer space */
429 * Load some data into buffer, if possible, starting from curOffset.
430 * At call, must have dirty = false, pos and nbytes = 0.
431 * On exit, nbytes is number of bytes loaded.
441 * Advance to next component file if necessary and possible.
458 * Read whatever we can get, up to a full bufferload.
464 WAIT_EVENT_BUFFILE_READ);
470 errmsg(
"could not read file \"%s\": %m",
480 /* we choose not to advance curOffset here */
489 * Dump buffer contents starting at curOffset.
490 * At call, should have dirty = true, nbytes > 0.
491 * On exit, dirty is cleared if successful write, and curOffset is advanced.
501 * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
502 * crosses a component-file boundary; so we need a loop.
504 while (wpos < file->nbytes)
511 * Advance to next component file if necessary and possible.
522 * Determine how much we need to write into this file.
527 if ((off_t) bytestowrite > availbytes)
528 bytestowrite = (int) availbytes;
541 WAIT_EVENT_BUFFILE_WRITE);
542 if (bytestowrite <= 0)
545 errmsg(
"could not write to file \"%s\": %m",
555 wpos += bytestowrite;
562 * At this point, curOffset has been advanced to the end of the buffer,
563 * ie, its original value + nbytes. We need to make it point to the
564 * logical file position, ie, original value + pos, in case that is less
565 * (as could happen due to a small backwards seek in a dirty buffer!)
568 if (file->
curOffset < 0)
/* handle possible segment crossing */
576 * Now we can set the buffer empty without changing the logical position
583 * BufFileRead variants
585 * Like fread() except we assume 1-byte element size and report I/O errors via
588 * If 'exact' is true, then an error is also raised if the number of bytes
589 * read is not exactly 'size' (no short reads). If 'exact' and 'eofOK' are
590 * true, then reading zero bytes is ok.
595 size_t start_size = size;
605 /* Try to load more data into buffer. */
611 break;
/* no more data available */
615 if (nthistime > size)
621 file->
pos += nthistime;
622 ptr = (
char *) ptr + nthistime;
628 (nread != start_size && !(nread == 0 && eofOK)))
632 errmsg(
"could not read from file set \"%s\": read only %zu of %zu bytes",
633 file->
name, nread, start_size) :
634 errmsg(
"could not read from temporary file: read only %zu of %zu bytes",
641 * Legacy interface where the caller needs to check for end of file or short
651 * Require read of exactly the specified size.
660 * Require read of exactly the specified size, but optionally allow end of
661 * file (in which case 0 is returned).
672 * Like fwrite() except we assume 1-byte element size and report errors via
684 if (file->
pos >= BLCKSZ)
686 /* Buffer full, dump it out */
691 /* Hmm, went directly from reading to writing? */
698 nthistime = BLCKSZ - file->
pos;
699 if (nthistime > size)
706 file->
pos += nthistime;
709 ptr = (
const char *) ptr + nthistime;
717 * Like fflush(), except that I/O errors are reported with ereport().
731 * Like fseek(), except that target position needs two values in order to
732 * work when logical filesize exceeds maximum value representable by off_t.
733 * We do not support relative seeks across more than that, however.
734 * I/O errors are reported by ereport().
736 * Result is 0 if OK, EOF if not. Logical position is not moved if an
737 * impossible seek is attempted.
756 * Relative seek considers only the signed offset, ignoring
757 * fileno. Note that large offsets (> 1 GB) risk overflow in this
758 * add, unless we have 64-bit off_t.
766 * The file size of the last file gives us the end offset of that
774 errmsg(
"could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
779 elog(
ERROR,
"invalid whence: %d", whence);
782 while (newOffset < 0)
788 if (newFile == file->
curFile &&
790 newOffset <= file->curOffset + file->
nbytes)
793 * Seek is to a point within existing buffer; we can just adjust
794 * pos-within-buffer, without flushing buffer. Note this is OK
795 * whether reading or writing, but buffer remains dirty if we were
801 /* Otherwise, must reposition buffer, so flush any dirty data */
805 * At this point and no sooner, check for seek past last segment. The
806 * above flush could have created a new segment, so checking sooner would
807 * not work (at least not with this code).
810 /* convert seek to "start of next seg" to "end of last seg" */
811 if (newFile == file->
numFiles && newOffset == 0)
840 * BufFileSeekBlock --- block-oriented seek
842 * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
843 * the file. Note that users of this interface will fail if their files
844 * exceed BLCKSZ * PG_INT64_MAX bytes, but that is quite a lot; we don't
845 * work with tables bigger than that, either...
847 * Result is 0 if OK, EOF if not. Logical position is not moved if an
848 * impossible seek is attempted.
860 * Returns the amount of data in the given BufFile, in bytes.
862 * Returned value includes the size of any holes left behind by BufFileAppend.
863 * ereport()s on failure.
870 /* Get the size of the last physical file. */
872 if (lastFileSize < 0)
875 errmsg(
"could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
884 * Append the contents of the source file to the end of the target file.
886 * Note that this operation subsumes ownership of underlying resources from
887 * "source". Caller should never call BufFileClose against source having
888 * called here first. Resource owners for source and target must match,
891 * This operation works by manipulating lists of segment files, so the
892 * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
893 * boundary, typically creating empty holes before the boundary. These
894 * areas do not contain any interesting data, and cannot be read from by
897 * Returns the block number within target where the contents of source
898 * begins. Caller should apply this as an offset when working off block
899 * positions that are in terms of the original BufFile space.
912 elog(
ERROR,
"could not append BufFile with non-matching resource owner");
924 * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno
931 int newFile = fileno;
937 * Loop over all the files up to the given fileno and remove the files
938 * that are greater than the fileno and truncate the given file up to the
939 * offset. Note that we also remove the given fileno if the offset is 0,
940 * provided it is not the first file, in which case we truncate it instead.
944 if ((
i != fileno || offset == 0) &&
i != 0)
951 errmsg(
"could not delete fileset \"%s\": %m",
957 * This is required to indicate that we have deleted the given
966 WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
969 errmsg(
"could not truncate file \"%s\": %m",
978 * If the truncate point is within existing buffer then we can just adjust
981 if (newFile == file->
curFile &&
983 newOffset <= file->curOffset + file->
nbytes)
985 /* No need to reset the current pos if the new pos is greater. */
986 if (newOffset <= file->curOffset + file->
pos)
989 /* Adjust the nbytes for the current buffer. */
992 else if (newFile == file->
curFile &&
993 newOffset < file->curOffset)
996 * The truncate point is within the existing file but prior to the
997 * current position, so we can forget the current buffer and reset the
1004 else if (newFile < file->curFile)
1007 * The truncate point is prior to the current file, so need to reset
1008 * the current position accordingly.
1015 /* Nothing to do, if the truncate point is beyond current file. */
void PrepareTempTablespaces(void)
BufFile * BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, bool missing_ok)
int BufFileSeekBlock(BufFile *file, int64 blknum)
void BufFileExportFileSet(BufFile *file)
size_t BufFileRead(BufFile *file, void *ptr, size_t size)
void BufFileReadExact(BufFile *file, void *ptr, size_t size)
static void FileSetSegmentName(char *name, const char *buffile_name, int segment)
static BufFile * makeBufFileCommon(int nfiles)
BufFile * BufFileCreateTemp(bool interXact)
static void BufFileLoadBuffer(BufFile *file)
static File MakeNewFileSetSegment(BufFile *buffile, int segment)
void BufFileTell(BufFile *file, int *fileno, off_t *offset)
static void extendBufFile(BufFile *file)
#define MAX_PHYSICAL_FILESIZE
static void BufFileFlush(BufFile *file)
void BufFileWrite(BufFile *file, const void *ptr, size_t size)
size_t BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
void BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
BufFile * BufFileCreateFileSet(FileSet *fileset, const char *name)
int BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
int64 BufFileSize(BufFile *file)
static BufFile * makeBufFile(File firstfile)
static size_t BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
void BufFileClose(BufFile *file)
int64 BufFileAppend(BufFile *target, BufFile *source)
void BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
static void BufFileDumpBuffer(BufFile *file)
int errcode_for_file_access(void)
int errmsg(const char *fmt,...)
#define ereport(elevel,...)
char * FilePathName(File file)
void FileClose(File file)
File OpenTemporaryFile(bool interXact)
off_t FileSize(File file)
int FileTruncate(File file, off_t offset, uint32 wait_event_info)
static ssize_t FileRead(File file, void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
static ssize_t FileWrite(File file, const void *buffer, size_t amount, off_t offset, uint32 wait_event_info)
File FileSetOpen(FileSet *fileset, const char *name, int mode)
bool FileSetDelete(FileSet *fileset, const char *name, bool error_on_failure)
File FileSetCreate(FileSet *fileset, const char *name)
Assert(PointerIsAligned(start, uint64))
#define INSTR_TIME_SET_CURRENT(t)
#define INSTR_TIME_SET_ZERO(t)
#define INSTR_TIME_ACCUM_DIFF(x, y, z)
BufferUsage pgBufferUsage
char * pstrdup(const char *in)
void * repalloc(void *pointer, Size size)
void pfree(void *pointer)
#define CHECK_FOR_INTERRUPTS()
static PgChecksumMode mode
static rewind_source * source
ResourceOwner CurrentResourceOwner
instr_time temp_blk_write_time
instr_time temp_blk_read_time