/*-------------------------------------------------------------------------
 *
 * hio.c
 *	  POSTGRES heap access method input/output code.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/heap/hio.c
 *
 *-------------------------------------------------------------------------
 */
/*
 * RelationPutHeapTuple - place tuple at specified page
 *
 * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!  Must PANIC on failure!!!
 *
 * Note - caller must hold BUFFER_LOCK_EXCLUSIVE on the buffer.
 */
void
RelationPutHeapTuple(Relation relation, Buffer buffer,
                     HeapTuple tuple, bool token)
{
    Page        pageHeader;
    OffsetNumber offnum;

    /*
     * A tuple that's being inserted speculatively should already have its
     * token set.
     */
    Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));

    /*
     * Do not allow tuples with invalid combinations of hint bits to be placed
     * on a page.  This combination is detected as corruption by the
     * contrib/amcheck logic, so if you disable this assertion, make
     * corresponding changes there.
     */
    Assert(!((tuple->t_data->t_infomask & HEAP_XMAX_COMMITTED) &&
             (tuple->t_data->t_infomask & HEAP_XMAX_IS_MULTI)));
    /* Add the tuple to the page */
    pageHeader = BufferGetPage(buffer);
    offnum = PageAddItem(pageHeader, (Item) tuple->t_data, tuple->t_len,
                         InvalidOffsetNumber, false, true);
    if (offnum == InvalidOffsetNumber)
        elog(PANIC, "failed to add tuple to page");
    /* Update tuple->t_self to the actual position where it was stored */
    ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), offnum);

    /*
     * Insert the correct position into CTID of the stored tuple, too (unless
     * this is a speculative insertion, in which case the token is held in
     * CTID field instead).
     */
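    /*
     * Illustrative sketch of the CTID update described above (assumed form,
     * not the verbatim body): mirror t_self into the on-page tuple's t_ctid
     * unless a speculative-insertion token occupies that field.
     */
    if (!token)
    {
        ItemId      itemId = PageGetItemId(pageHeader, offnum);
        HeapTupleHeader onpage_tuple = (HeapTupleHeader) PageGetItem(pageHeader, itemId);

        onpage_tuple->t_ctid = tuple->t_self;
    }
}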
/*
 * Read in a buffer in mode, using bulk-insert strategy if bistate isn't NULL.
 */
static Buffer
ReadBufferBI(Relation relation, BlockNumber targetBlock,
             ReadBufferMode mode, BulkInsertState bistate)
{
    Buffer      buffer;

    /* If not bulk-insert, exactly like ReadBuffer */
    /* If we have the desired block already pinned, re-pin and return it */
    /*
     * Currently the LOCK variants are only used for extending relation,
     * which should never reach this branch.
     */
    /* ... else drop the old buffer */
    /* Perform a read using the buffer strategy */
    /* Save the selected block as target for future inserts */
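    /*
     * Illustrative sketch of the steps outlined above (assumed form, not the
     * verbatim body).
     */
    if (bistate == NULL)
        return ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
                                  mode, NULL);

    if (bistate->current_buf != InvalidBuffer)
    {
        if (BufferGetBlockNumber(bistate->current_buf) == targetBlock)
        {
            /* extension uses the LOCK variants, so they never get here */
            Assert(mode == RBM_NORMAL);
            IncrBufferRefCount(bistate->current_buf);
            return bistate->current_buf;
        }
        ReleaseBuffer(bistate->current_buf);
    }

    buffer = ReadBufferExtended(relation, MAIN_FORKNUM, targetBlock,
                                mode, bistate->strategy);
    bistate->current_buf = buffer;
    return buffer;
}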
/*
 * For each heap page which is all-visible, acquire a pin on the appropriate
 * visibility map page, if we haven't already got one.
 *
 * To avoid complexity in the callers, either buffer1 or buffer2 may be
 * InvalidBuffer if only one buffer is involved.  For the same reason, block2
 * may be smaller than block1.
 *
 * Returns whether buffer locks were temporarily released.
 */
static bool
GetVisibilityMapPins(Relation relation, Buffer buffer1, Buffer buffer2,
                     BlockNumber block1, BlockNumber block2,
                     Buffer *vmbuffer1, Buffer *vmbuffer2)
{
    bool        need_to_pin_buffer1;
    bool        need_to_pin_buffer2;
    bool        released_locks = false;
    /*
     * Swap buffers around to handle the case of a single block/buffer, and to
     * handle if lock ordering rules require locking block2 first.
     */
    Buffer     *tmpvmbuf = vmbuffer1;

    vmbuffer1 = vmbuffer2;
    vmbuffer2 = tmpvmbuf;
    for (;;)
    {
        /* Figure out which pins we need but don't have. */
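        /*
         * Sketch of the checks (assumed form): a pin is needed when the heap
         * page is all-visible but the visibility map buffer we hold does not
         * cover that heap block.
         */
        need_to_pin_buffer1 = PageIsAllVisible(BufferGetPage(buffer1))
            && !visibilitymap_pin_ok(block1, *vmbuffer1);
        need_to_pin_buffer2 = buffer2 != InvalidBuffer
            && PageIsAllVisible(BufferGetPage(buffer2))
            && !visibilitymap_pin_ok(block2, *vmbuffer2);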
        if (!need_to_pin_buffer1 && !need_to_pin_buffer2)
            break;

        /* We must unlock both buffers before doing any I/O. */
        released_locks = true;

        if (need_to_pin_buffer1)
            visibilitymap_pin(relation, block1, vmbuffer1);
        if (need_to_pin_buffer2)
            visibilitymap_pin(relation, block2, vmbuffer2);

        /* Relock buffers. */

        /*
         * If there are two buffers involved and we pinned just one of them,
         * it's possible that the second one became all-visible while we were
         * busy pinning the first one.  If it looks like that's a possible
         * scenario, we'll need to make a second pass through this loop.
         */
        if (buffer2 == InvalidBuffer || buffer1 == buffer2
            || (need_to_pin_buffer1 && need_to_pin_buffer2))
            break;
    }

    return released_locks;
}
/*
 * Extend the relation, by multiple pages if beneficial.
 *
 * If the caller needs multiple pages (num_pages > 1), we always try to extend
 * by at least that much.
 *
 * If there is contention on the extension lock, we don't just extend "for
 * ourselves", but we try to help others.  We can do so by adding empty pages
 * into the FSM.  Typically there is no contention when we can't use the FSM.
 *
 * We do have to limit the number of pages to extend by to some value, as the
 * buffers for all the extended pages need to, temporarily, be pinned.  For now
 * we define MAX_BUFFERS_TO_EXTEND_BY to be 64 buffers; it's hard to see
 * benefits with higher numbers.  This is partially because copyfrom.c's
 * MAX_BUFFERED_TUPLES / MAX_BUFFERED_BYTES prevents larger multi_inserts.
 *
 * Returns a buffer for a newly extended block.  If possible, the buffer is
 * returned exclusively locked.  *did_unlock is set to true if the lock had to
 * be released, false otherwise.
 *
 * XXX: It would likely be beneficial for some workloads to extend more
 * aggressively, e.g. using a heuristic based on the relation size.
 */
static Buffer
RelationAddBlocks(Relation relation, BulkInsertState bistate,
                  int num_pages, bool use_fsm, bool *did_unlock)
{
#define MAX_BUFFERS_TO_EXTEND_BY 64

    /*
     * Determine by how many pages to try to extend.
     */
    if (bistate == NULL && !use_fsm)
    {
        /*
         * If we have no bistate and can't use the FSM, we can't bulk extend -
         * there'd be no way to find the additional pages.
         */
        extend_by_pages = 1;
    }
    else
    {
        /*
         * Try to extend at least by the number of pages the caller needs.  We
         * can remember the additional pages (either via FSM or bistate).
         */
        extend_by_pages = num_pages;
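        /*
         * Sketch of how the waiter count is obtained (assumed placement and
         * spelling): ask the lock manager how many backends are waiting on
         * this relation's extension lock; local relations see no such
         * contention.
         */
        uint32      waitcount = 0;

        if (!RELATION_IS_LOCAL(relation))
            waitcount = RelationExtensionLockWaiterCount(relation);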
        /*
         * Multiply the number of pages to extend by the number of waiters.  Do
         * this even if we're not using the FSM, as it still relieves
         * contention, by deferring the next time this backend needs to
         * extend.  In that case the extended pages will be found via
         * bistate->next_free.
         */
        extend_by_pages += extend_by_pages * waitcount;
    }
    /*
     * If we previously extended using the same bistate, it's very likely
     * we'll extend some more.  Try to extend by as many pages as before.
     * This can be important for performance for several reasons, including:
     *
     * - It prevents mdzeroextend() switching between extending the relation
     *   in different ways, which is inefficient for some filesystems.
     *
     * - Contention is often intermittent.  Even if we currently don't see
     *   other waiters (see above), extending by larger amounts can prevent
     *   future contention.
     */

    /*
     * Can't extend by more than MAX_BUFFERS_TO_EXTEND_BY; we need to pin
     * them all concurrently.
     */
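    /*
     * Sketch (assumed form): reuse the previous extension size for the same
     * bistate if that was larger, then clamp to the pin limit discussed
     * above.
     */
    if (bistate)
        extend_by_pages = Max(extend_by_pages, bistate->already_extended_by);
    extend_by_pages = Min(extend_by_pages, MAX_BUFFERS_TO_EXTEND_BY);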
    /*
     * How many of the extended pages should be entered into the FSM?
     *
     * If we have a bistate, only enter pages that we don't need ourselves
     * into the FSM.  Otherwise every other backend will immediately try to
     * use the pages this backend needs for itself, causing unnecessary
     * contention.  If we don't have a bistate, we can't avoid the FSM.
     *
     * Never enter the page returned into the FSM; we'll immediately use it.
     */
    if (num_pages > 1 && bistate == NULL)
        not_in_fsm_pages = 1;
    else
        not_in_fsm_pages = num_pages;
    /* prepare to put another buffer into the bistate */

    /*
     * Extend the relation.  We ask for the first returned page to be locked,
     * so that we are sure that nobody has inserted into the page
     * concurrently.
     *
     * With the current MAX_BUFFERS_TO_EXTEND_BY there's no danger of
     * [auto]vacuum trying to truncate later pages as REL_TRUNCATE_MINIMUM is
     * way larger.
     */
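    /*
     * Sketch of the extension call described above (assumed flag and argument
     * spelling): extend by extend_by_pages blocks at once and have the first
     * returned buffer come back exclusively locked.
     */
    first_block = ExtendBufferedRelBy(BMR_REL(relation), MAIN_FORKNUM,
                                      bistate ? bistate->strategy : NULL,
                                      EB_LOCK_FIRST,
                                      extend_by_pages,
                                      victim_buffers,
                                      &extend_by_pages);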
    buffer = victim_buffers[0];     /* the buffer the function will return */
    last_block = first_block + (extend_by_pages - 1);
    /*
     * Relation is now extended.  Initialize the page.  We do this here, before
     * potentially releasing the lock on the page, because it allows us to
     * double check that the page contents are empty (this should never
     * happen, but if it does we don't want to risk wiping out valid data).
     */
    page = BufferGetPage(buffer);
    if (!PageIsNew(page))
        elog(ERROR, "page %u of relation \"%s\" should be empty but is not",
             first_block, RelationGetRelationName(relation));
    /*
     * If we decided to put pages into the FSM, release the buffer lock (but
     * not the pin); we don't want to do I/O while holding a buffer lock.  This
     * will necessitate a bit more extensive checking in our caller.
     */
    if (use_fsm && not_in_fsm_pages < extend_by_pages)
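    {
        /* Sketch (assumed form): drop the lock but keep the pin. */
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        *did_unlock = true;
    }
    else
        *did_unlock = false;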
    /*
     * Relation is now extended.  Release pins on all buffers, except for the
     * first (which we'll return).  If we decided to put pages into the FSM,
     * we can do that as part of the same loop.
     */
    for (uint32 i = 1; i < extend_by_pages; i++)
    {
        if (use_fsm && i >= not_in_fsm_pages)
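        {
            /*
             * Sketch (assumed form): remember the free space of a page we are
             * giving away so other backends can find it via the FSM.
             */
            Size        freespace = BufferGetPageSize(victim_buffers[i]) -
                SizeOfPageHeaderData;

            RecordPageWithFreeSpace(relation, first_block + i, freespace);
        }

        ReleaseBuffer(victim_buffers[i]);
    }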
    if (use_fsm && not_in_fsm_pages < extend_by_pages)
    {
        BlockNumber first_fsm_block = first_block + not_in_fsm_pages;
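        /*
         * Sketch (assumed form): make the new FSM entries visible to other
         * backends by updating the upper FSM levels for the affected range.
         */
        FreeSpaceMapVacuumRange(relation, first_fsm_block, last_block);
    }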
    /*
     * Remember the additional pages we extended by, so we later can use
     * them without looking into the FSM.
     */
    if (extend_by_pages > 1)
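    {
        /*
         * Sketch (assumed field usage): stash the extra block range in the
         * bistate so the next insertion can use it without consulting the
         * FSM.
         */
        if (bistate)
        {
            bistate->next_free = first_block + 1;
            bistate->last_free = last_block;
            bistate->already_extended_by += extend_by_pages;
        }
    }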
    /* maintain bistate->current_buf */

    return buffer;
#undef MAX_BUFFERS_TO_EXTEND_BY
}
/*
 * RelationGetBufferForTuple
 *
 * Returns pinned and exclusive-locked buffer of a page in given relation
 * with free space >= given len.
 *
 * If num_pages is > 1, we will try to extend the relation by at least that
 * many pages when we decide to extend the relation.  This is more efficient
 * for callers that know they will need multiple pages
 * (e.g. heap_multi_insert()).
 *
 * If otherBuffer is not InvalidBuffer, then it references a previously
 * pinned buffer of another page in the same relation; on return, this
 * buffer will also be exclusive-locked.  (This case is used by heap_update;
 * the otherBuffer contains the tuple being updated.)
 *
 * The reason for passing otherBuffer is that if two backends are doing
 * concurrent heap_update operations, a deadlock could occur if they try
 * to lock the same two buffers in opposite orders.  To ensure that this
 * can't happen, we impose the rule that buffers of a relation must be
 * locked in increasing page number order.  This is most conveniently done
 * by having RelationGetBufferForTuple lock them both, with suitable care
 * for ordering.
 *
 * NOTE: it is unlikely, but not quite impossible, for otherBuffer to be the
 * same buffer we select for insertion of the new tuple (this could only
 * happen if space is freed in that page after heap_update finds there's not
 * enough there).  In that case, the page will be pinned and locked only once.
 *
 * We also handle the possibility that the all-visible flag will need to be
 * cleared on one or both pages.  If so, pin on the associated visibility map
 * page must be acquired before acquiring buffer lock(s), to avoid possibly
 * doing I/O while holding buffer locks.  The pins are passed back to the
 * caller using the input-output arguments vmbuffer and vmbuffer_other.
 * Note that in some cases the caller might have already acquired such pins,
 * which is indicated by these arguments not being InvalidBuffer on entry.
 *
 * We normally use FSM to help us find free space.  However,
 * if HEAP_INSERT_SKIP_FSM is specified, we just append a new empty page to
 * the end of the relation if the tuple won't fit on the current target page.
 * This can save some cycles when we know the relation is new and doesn't
 * contain useful amounts of free space.
 *
 * HEAP_INSERT_SKIP_FSM is also useful for non-WAL-logged additions to a
 * relation, if the caller holds exclusive lock and is careful to invalidate
 * relation's smgr_targblock before the first insertion --- that ensures that
 * all insertions will occur into newly added pages and not be intermixed
 * with tuples from other transactions.  That way, a crash can't risk losing
 * any committed data of other transactions.  (See heap_insert's comments
 * for additional constraints needed for safe usage of this behavior.)
 *
 * The caller can also provide a BulkInsertState object to optimize many
 * insertions into the same relation.  This keeps a pin on the current
 * insertion target page (to save pin/unpin cycles) and also passes a
 * BULKWRITE buffer selection strategy object to the buffer manager.
 * Passing NULL for bistate selects the default behavior.
 *
 * We don't fill existing pages further than the fillfactor, except for large
 * tuples in nearly-empty pages.  This is OK since this routine is not
 * consulted when updating a tuple and keeping it on the same page, which is
 * the scenario fillfactor is meant to reserve space for.
 *
 * ereport(ERROR) is allowed here, so this routine *must* be called
 * before any (unlogged) changes are made in buffer pool.
 */
Buffer
RelationGetBufferForTuple(Relation relation, Size len,
                          Buffer otherBuffer, int options,
                          BulkInsertState bistate,
                          Buffer *vmbuffer, Buffer *vmbuffer_other,
                          int num_pages)
{
    Size        nearlyEmptyFreeSpace;
    bool        unlockedTargetBuffer;

    /* if the caller doesn't know by how many pages to extend, extend by 1 */
    if (num_pages <= 0)
        num_pages = 1;

    /* Bulk insert is not supported for updates, only inserts. */
    Assert(otherBuffer == InvalidBuffer || !bistate);
    /*
     * If we're gonna fail for oversize tuple, do it right away
     */
    if (len > MaxHeapTupleSize)
        ereport(ERROR,
                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
                 errmsg("row is too big: size %zu, maximum size %zu",
                        len, MaxHeapTupleSize)));
    /* Compute desired extra freespace due to fillfactor option */
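    /* Sketch (assumed form), based on the relation's fillfactor reloption. */
    saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
                                                   HEAP_DEFAULT_FILLFACTOR);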
    /*
     * Since pages without tuples can still have line pointers, we consider
     * pages "empty" when the unavailable space is slight.  This threshold is
     * somewhat arbitrary, but it should prevent most unnecessary relation
     * extensions while inserting large tuples into low-fillfactor tables.
     */
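    /* Sketch of the threshold described above (assumed exact expression). */
    nearlyEmptyFreeSpace = MaxHeapTupleSize -
        (MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData));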
    if (len + saveFreeSpace > nearlyEmptyFreeSpace)
        targetFreeSpace = Max(len, nearlyEmptyFreeSpace);
    else
        targetFreeSpace = len + saveFreeSpace;
    /*
     * We first try to put the tuple on the same page we last inserted a tuple
     * on, as cached in the BulkInsertState or relcache entry.  If that
     * doesn't work, we ask the Free Space Map to locate a suitable page.
     * Since the FSM's info might be out of date, we have to be prepared to
     * loop around and retry multiple times.  (To ensure this isn't an infinite
     * loop, we must update the FSM with the correct amount of free space on
     * each page that proves not to be suitable.)  If the FSM has no record of
     * a page with enough free space, we give up and extend the relation.
     *
     * When use_fsm is false, we either put the tuple onto the existing target
     * page or extend the relation.
     */
        /*
         * We have no cached target page, so ask the FSM for an initial
         * target.
         */
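        /* Sketch (assumed form). */
        targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);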
        /*
         * If the FSM knows nothing of the rel, try the last page before we
         * give up and extend.  This avoids one-tuple-per-page syndrome during
         * bootstrapping or in a recently-started system.
         */
        targetBlock = nblocks - 1;
        /*
         * Read and exclusive-lock the target block, as well as the other
         * block if one was given, taking suitable care with lock ordering and
         * the possibility they are the same block.
         *
         * If the page-level all-visible flag is set, caller will need to
         * clear both that and the corresponding visibility map bit.  However,
         * by the time we return, we'll have x-locked the buffer, and we don't
         * want to do any I/O while in that state.  So we check the bit here
         * before taking the lock, and pin the page if it appears necessary.
         * Checking without the lock creates a risk of getting the wrong
         * answer, so we'll have to recheck after acquiring the lock.
         */
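        /*
         * Illustrative sketch of the first branch (assumed form): with no
         * other buffer involved we can simply read and lock the target block,
         * pinning the visibility map page first if that looks necessary.
         */
        if (otherBuffer == InvalidBuffer)
        {
            buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);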
            /*
             * If the page is empty, pin vmbuffer to set all_frozen bit later.
             */
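            /*
             * Sketch (assumed form): only needed for HEAP_INSERT_FROZEN on a
             * page that has no line pointers yet.
             */
            if ((options & HEAP_INSERT_FROZEN) &&
                PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0)
                visibilitymap_pin(relation, targetBlock, vmbuffer);

            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }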
        else if (otherBlock == targetBlock)
            buffer = otherBuffer;
        else if (otherBlock < targetBlock)
        {
            /* lock other buffer first */
        }
        else
        {
            /* lock target buffer first */
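            /*
             * Illustrative sketch of this branch (assumed form): the target
             * block is the lower-numbered one, so lock it before otherBuffer
             * to respect the increasing-block-number rule described above.
             */
            buffer = ReadBuffer(relation, targetBlock);
            if (PageIsAllVisible(BufferGetPage(buffer)))
                visibilitymap_pin(relation, targetBlock, vmbuffer);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
        }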
        /*
         * We now have the target page (and the other buffer, if any) pinned
         * and locked.  However, since our initial PageIsAllVisible checks
         * were performed before acquiring the lock, the results might now be
         * out of date, either for the selected victim buffer, or for the
         * other buffer passed by the caller.  In that case, we'll need to
         * give up our locks, go get the pin(s) we failed to get earlier, and
         * re-lock.  That's pretty painful, but hopefully shouldn't happen
         * often.
         *
         * Note that there's a small possibility that we didn't pin the page
         * above but still have the correct page pinned anyway, either because
         * we've already made a previous pass through this loop, or because
         * caller passed us the right page anyway.
         *
         * Note also that it's possible that by the time we get the pin and
         * retake the buffer locks, the visibility map bit will have been
         * cleared by some other backend anyway.  In that case, we'll have
         * done a bit of extra work for no gain, but there's no real harm
         * done.
         */
        GetVisibilityMapPins(relation, buffer, otherBuffer,
                             targetBlock, otherBlock, vmbuffer,
                             vmbuffer_other);
        /*
         * Now we can check to see if there's enough free space here.  If so,
         * we're done.
         */
        /*
         * If necessary initialize page, it'll be used soon.  We could avoid
         * dirtying the buffer here, and rely on the caller to do so whenever
         * it puts a tuple onto the page, but there seems not much benefit in
         * doing so.
         */
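        /* Sketch (assumed form). */
        page = BufferGetPage(buffer);
        if (PageIsNew(page))
            PageInit(page, BufferGetPageSize(buffer), 0);

        pageFreeSpace = PageGetHeapFreeSpace(page);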
        if (targetFreeSpace <= pageFreeSpace)
        {
            /* use this page as future insert target, too */
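            /* Sketch (assumed form). */
            RelationSetTargetBlock(relation, targetBlock);
            return buffer;
        }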
        /*
         * Not enough space, so we must give up our page locks and pin (if
         * any) and prepare to look elsewhere.  We don't care which order we
         * unlock the two buffers in, so this can be slightly simpler than the
         * code above.
         */
        else if (otherBlock != targetBlock)
        /* Is there an ongoing bulk extension? */
        if (bistate && bistate->next_free != InvalidBlockNumber)
        {
            /*
             * We bulk extended the relation before, and there are still some
             * unused pages from that extension, so we don't need to look in
             * the FSM for a new page.  But do record the free space from the
             * last page; somebody might insert narrower tuples later.
             */
        }
        /* Without FSM, always fall out of the loop and extend */

        /*
         * Update FSM as to condition of this page, and ask for another
         * page to try.
         */
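        /* Sketch (assumed form). */
        targetBlock = RecordAndGetPageWithFreeSpace(relation,
                                                    targetBlock,
                                                    pageFreeSpace,
                                                    targetFreeSpace);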
    /* Have to extend the relation */
    buffer = RelationAddBlocks(relation, bistate, num_pages, use_fsm,
                               &unlockedTargetBuffer);
    targetBlock = BufferGetBlockNumber(buffer);
    /*
     * The page is empty, pin vmbuffer to set all_frozen bit.  We don't want to
     * do IO while the buffer is locked, so we unlock the page first if IO is
     * needed (necessitating checks below).
     */
    if (!unlockedTargetBuffer)
    {
        LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
        unlockedTargetBuffer = true;
    }
    visibilitymap_pin(relation, targetBlock, vmbuffer);
    /*
     * Reacquire locks if necessary.
     *
     * If the target buffer was unlocked above, or is unlocked while
     * reacquiring the lock on otherBuffer below, it's unlikely, but possible,
     * that another backend used space on this page.  We check for that below,
     * and retry if necessary.
     */
    recheckVmPins = false;
    if (unlockedTargetBuffer)
    {
        /* released lock on target buffer above */
        if (otherBuffer != InvalidBuffer)
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
        LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        recheckVmPins = true;
    }
    else if (otherBuffer != InvalidBuffer)
    {
        /*
         * We did not release the target buffer, and otherBuffer is valid, so
         * we need to lock the other buffer.  It's guaranteed to be of a lower
         * page number than the new page.  To conform with the deadlock
         * prevention rules, we ought to lock otherBuffer first, but that
         * would give other backends a chance to put tuples on our page.  To
         * reduce the likelihood of that, attempt to lock the other buffer
         * conditionally; that's very likely to work.
         *
         * Alternatively, we could acquire the lock on otherBuffer before
         * extending the relation, but that'd require holding the lock while
         * performing IO, which seems worse than an unlikely retry.
         */
        Assert(otherBuffer != buffer);
        Assert(targetBlock > otherBlock);

        if (unlikely(!ConditionalLockBuffer(otherBuffer)))
        {
            unlockedTargetBuffer = true;
            LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
            LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
        }
        recheckVmPins = true;
    }
    /*
     * If one of the buffers was unlocked (always the case if otherBuffer is
     * valid), it's possible, although unlikely, that an all-visible flag
     * became set.  We can use GetVisibilityMapPins to deal with that.  It's
     * possible that GetVisibilityMapPins() might need to temporarily release
     * buffer locks, in which case we'll need to check if there's still enough
     * space on the page below.
     */
    if (GetVisibilityMapPins(relation, otherBuffer, buffer,
                             otherBlock, targetBlock, vmbuffer_other,
                             vmbuffer))
        unlockedTargetBuffer = true;
    /*
     * If the target buffer was temporarily unlocked since the relation
     * extension, it's possible, although unlikely, that all the space on the
     * page was already used.  If so, we just retry from the start.  If we
     * didn't unlock, something has gone wrong if there's not enough space -
     * the test at the top should have prevented reaching this case.
     */
    pageFreeSpace = PageGetHeapFreeSpace(BufferGetPage(buffer));
    if (len > pageFreeSpace)
    {
        if (unlockedTargetBuffer)
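        {
            /*
             * Sketch (assumed form): give the page back and retry the search
             * from the top of the FSM loop.
             */
            if (otherBuffer != InvalidBuffer)
                LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
            UnlockReleaseBuffer(buffer);
            goto loop;          /* "loop" label assumed at the top of the search */
        }
        elog(ERROR, "tuple is too big: size %zu", len);
    }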
    /*
     * Remember the new page as our target for future insertions.
     *
     * XXX should we enter the new page into the free space map immediately,
     * or just keep it for this backend's exclusive use in the short run
     * (until VACUUM sees it)?  Seems to depend on whether you expect the
     * current backend to make more insertions or not, which is probably a
     * good bet most of the time.  So for now, don't add it to FSM yet.
     */
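    /* Sketch (assumed form). */
    RelationSetTargetBlock(relation, targetBlock);

    return buffer;
}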