{
	bool		use_fsm = !(options & HEAP_INSERT_SKIP_FSM);
	Buffer		buffer = InvalidBuffer;
	Page		page;
	Size		nearlyEmptyFreeSpace,
				pageFreeSpace = 0,
				saveFreeSpace = 0,
				targetFreeSpace = 0;
	BlockNumber targetBlock,
				otherBlock;
	bool		unlockedTargetBuffer;
	bool		recheckVmPins;

	len = MAXALIGN(len);		/* be conservative */

	/* if the caller doesn't know by how many pages to extend, extend by 1 */
	if (num_pages <= 0)
		num_pages = 1;

	/* Bulk insert is not supported for updates, only inserts. */
	Assert(otherBuffer == InvalidBuffer || !bistate);

	/*
	 * If we're going to fail for an oversize tuple, do it right away.
	 */
	if (len > MaxHeapTupleSize)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("row is too big: size %zu, maximum size %zu",
						len, MaxHeapTupleSize)));

	/* Compute desired extra freespace due to fillfactor option */
	saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
												   HEAP_DEFAULT_FILLFACTOR);

	/*
	 * Since pages without tuples can still have line pointers, we consider
	 * pages "empty" when the unavailable space is slight.  This threshold is
	 * somewhat arbitrary, but it should prevent most unnecessary relation
	 * extensions while inserting large tuples into low-fillfactor tables.
	 */
	nearlyEmptyFreeSpace = MaxHeapTupleSize -
		(MaxHeapTuplesPerPage / 8 * sizeof(ItemIdData));
	if (len + saveFreeSpace > nearlyEmptyFreeSpace)
		targetFreeSpace = Max(len, nearlyEmptyFreeSpace);
	else
		targetFreeSpace = len + saveFreeSpace;

	if (otherBuffer != InvalidBuffer)
		otherBlock = BufferGetBlockNumber(otherBuffer);
	else
		otherBlock = InvalidBlockNumber;	/* just to keep compiler quiet */

	/*
	 * We first try to put the tuple on the same page we last inserted a tuple
	 * on, as cached in the BulkInsertState or relcache entry.  If that
	 * doesn't work, we ask the Free Space Map to locate a suitable page.
	 * Since the FSM's info might be out of date, we have to be prepared to
	 * loop around and retry multiple times. (To ensure this isn't an infinite
	 * loop, we must update the FSM with the correct amount of free space on
	 * each page that proves not to be suitable.)  If the FSM has no record of
	 * a page with enough free space, we give up and extend the relation.
	 *
	 * When use_fsm is false, we either put the tuple onto the existing target
	 * page or extend the relation.
	 */
	if (bistate && bistate->current_buf != InvalidBuffer)
		targetBlock = BufferGetBlockNumber(bistate->current_buf);
	else
		targetBlock = RelationGetTargetBlock(relation);

	if (targetBlock == InvalidBlockNumber && use_fsm)
	{
		/*
		 * We have no cached target page, so ask the FSM for an initial
		 * target.
		 */
		targetBlock = GetPageWithFreeSpace(relation, targetFreeSpace);
	}

	/*
	 * If the FSM knows nothing of the rel, try the last page before we give
	 * up and extend.  This avoids one-tuple-per-page syndrome during
	 * bootstrapping or in a recently-started system.
	 */
	if (targetBlock == InvalidBlockNumber)
	{
		BlockNumber nblocks = RelationGetNumberOfBlocks(relation);

		if (nblocks > 0)
			targetBlock = nblocks - 1;
	}

loop:
	while (targetBlock != InvalidBlockNumber)
	{
		/*
		 * Read and exclusive-lock the target block, as well as the other
		 * block if one was given, taking suitable care with lock ordering and
		 * the possibility they are the same block.
		 *
		 * If the page-level all-visible flag is set, caller will need to
		 * clear both that and the corresponding visibility map bit.  However,
		 * by the time we return, we'll have x-locked the buffer, and we don't
		 * want to do any I/O while in that state.  So we check the bit here
		 * before taking the lock, and pin the page if it appears necessary.
		 * Checking without the lock creates a risk of getting the wrong
		 * answer, so we'll have to recheck after acquiring the lock.
		 */
		if (otherBuffer == InvalidBuffer)
		{
			/* easy case */
			buffer = ReadBufferBI(relation, targetBlock, RBM_NORMAL, bistate);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);

			/*
			 * If the page is empty, pin vmbuffer to set all_frozen bit later.
			 */
			if ((options & HEAP_INSERT_FROZEN) &&
				(PageGetMaxOffsetNumber(BufferGetPage(buffer)) == 0))
				visibilitymap_pin(relation, targetBlock, vmbuffer);

			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock == targetBlock)
		{
			/* also easy case */
			buffer = otherBuffer;
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else if (otherBlock < targetBlock)
		{
			/* lock other buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		else
		{
			/* lock target buffer first */
			buffer = ReadBuffer(relation, targetBlock);
			if (PageIsAllVisible(BufferGetPage(buffer)))
				visibilitymap_pin(relation, targetBlock, vmbuffer);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		}

		/*
		 * We now have the target page (and the other buffer, if any) pinned
		 * and locked.  However, since our initial PageIsAllVisible checks
		 * were performed before acquiring the lock, the results might now be
		 * out of date, either for the selected victim buffer, or for the
		 * other buffer passed by the caller.  In that case, we'll need to
		 * give up our locks, go get the pin(s) we failed to get earlier, and
		 * re-lock.  That's pretty painful, but hopefully shouldn't happen
		 * often.
		 *
		 * Note that there's a small possibility that we didn't pin the page
		 * above but still have the correct page pinned anyway, either because
		 * we've already made a previous pass through this loop, or because
		 * caller passed us the right page anyway.
		 *
		 * Note also that it's possible that by the time we get the pin and
		 * retake the buffer locks, the visibility map bit will have been
		 * cleared by some other backend anyway.  In that case, we'll have
		 * done a bit of extra work for no gain, but there's no real harm
		 * done.
		 */
		GetVisibilityMapPins(relation, buffer, otherBuffer,
							 targetBlock, otherBlock, vmbuffer,
							 vmbuffer_other);

		/*
		 * Now we can check to see if there's enough free space here. If so,
		 * we're done.
		 */
		page = BufferGetPage(buffer);

		/*
		 * If necessary initialize page, it'll be used soon.  We could avoid
		 * dirtying the buffer here, and rely on the caller to do so whenever
		 * it puts a tuple onto the page, but there seems not much benefit in
		 * doing so.
		 */
		if (PageIsNew(page))
		{
			PageInit(page, BufferGetPageSize(buffer), 0);
			MarkBufferDirty(buffer);
		}

		pageFreeSpace = PageGetHeapFreeSpace(page);
		if (targetFreeSpace <= pageFreeSpace)
		{
			/* use this page as future insert target, too */
			RelationSetTargetBlock(relation, targetBlock);
			return buffer;
		}

		/*
		 * Not enough space, so we must give up our page locks and pin (if
		 * any) and prepare to look elsewhere.  We don't care which order we
		 * unlock the two buffers in, so this can be slightly simpler than the
		 * code above.
		 */
		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
		if (otherBuffer == InvalidBuffer)
			ReleaseBuffer(buffer);
		else if (otherBlock != targetBlock)
		{
			LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			ReleaseBuffer(buffer);
		}

		/* Is there an ongoing bulk extension? */
		if (bistate && bistate->next_free != InvalidBlockNumber)
		{
			Assert(bistate->next_free <= bistate->last_free);

			/*
			 * We bulk extended the relation before, and there are still some
			 * unused pages from that extension, so we don't need to look in
			 * the FSM for a new page. But do record the free space from the
			 * last page, somebody might insert narrower tuples later.
			 */
			if (use_fsm)
				RecordPageWithFreeSpace(relation, targetBlock, pageFreeSpace);

			targetBlock = bistate->next_free;
			if (bistate->next_free >= bistate->last_free)
			{
				bistate->next_free = InvalidBlockNumber;
				bistate->last_free = InvalidBlockNumber;
			}
			else
				bistate->next_free++;
		}
		else if (!use_fsm)
		{
			/* Without FSM, always fall out of the loop and extend */
			break;
		}
		else
		{
			/*
			 * Update FSM as to condition of this page, and ask for another
			 * page to try.
			 */
			targetBlock = RecordAndGetPageWithFreeSpace(relation,
														targetBlock,
														pageFreeSpace,
														targetFreeSpace);
		}
	}

	/* Have to extend the relation */
	buffer = RelationAddBlocks(relation, bistate, num_pages, use_fsm,
							   &unlockedTargetBuffer);

	targetBlock = BufferGetBlockNumber(buffer);
	page = BufferGetPage(buffer);

	/*
	 * The page is empty, pin vmbuffer to set all_frozen bit. We don't want to
	 * do IO while the buffer is locked, so we unlock the page first if IO is
	 * needed (necessitating checks below).
	 */
	if (options & HEAP_INSERT_FROZEN)
	{
		Assert(PageGetMaxOffsetNumber(page) == 0);

		if (!visibilitymap_pin_ok(targetBlock, *vmbuffer))
		{
			if (!unlockedTargetBuffer)
				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			unlockedTargetBuffer = true;
			visibilitymap_pin(relation, targetBlock, vmbuffer);
		}
	}

	/*
	 * Reacquire locks if necessary.
	 *
	 * If the target buffer was unlocked above, or is unlocked while
	 * reacquiring the lock on otherBuffer below, it's unlikely, but possible,
	 * that another backend used space on this page. We check for that below,
	 * and retry if necessary.
	 */
	recheckVmPins = false;
	if (unlockedTargetBuffer)
	{
		/* released lock on target buffer above */
		if (otherBuffer != InvalidBuffer)
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		recheckVmPins = true;
	}
	else if (otherBuffer != InvalidBuffer)
	{
		/*
		 * We did not release the target buffer, and otherBuffer is valid, so
		 * we need to lock the other buffer. It's guaranteed to be of a lower
		 * page number than the new page.  To conform with the deadlock
		 * prevention rules, we ought to lock otherBuffer first, but that
		 * would give other backends a chance to put tuples on our page. To
		 * reduce the likelihood of that, attempt to lock the other buffer
		 * conditionally, which is very likely to work.
		 *
		 * Alternatively, we could acquire the lock on otherBuffer before
		 * extending the relation, but that'd require holding the lock while
		 * performing IO, which seems worse than an unlikely retry.
		 */
		Assert(otherBuffer != buffer);
		Assert(targetBlock > otherBlock);

		if (unlikely(!ConditionalLockBuffer(otherBuffer)))
		{
			unlockedTargetBuffer = true;
			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
			LockBuffer(otherBuffer, BUFFER_LOCK_EXCLUSIVE);
			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
		}
		recheckVmPins = true;
	}

	/*
	 * If one of the buffers was unlocked (always the case if otherBuffer is
	 * valid), it's possible, although unlikely, that an all-visible flag
	 * became set.  We can use GetVisibilityMapPins to deal with that. It's
	 * possible that GetVisibilityMapPins() might need to temporarily release
	 * buffer locks, in which case we'll need to check if there's still enough
	 * space on the page below.
	 */
	if (recheckVmPins)
	{
		if (GetVisibilityMapPins(relation, otherBuffer, buffer,
								 otherBlock, targetBlock, vmbuffer_other,
								 vmbuffer))
			unlockedTargetBuffer = true;
	}

	/*
	 * If the target buffer was temporarily unlocked since the relation
	 * extension, it's possible, although unlikely, that all the space on the
	 * page was already used. If so, we just retry from the start.  If we
	 * didn't unlock, something has gone wrong if there's not enough space -
	 * the test at the top should have prevented reaching this case.
	 */
	pageFreeSpace = PageGetHeapFreeSpace(page);
	if (len > pageFreeSpace)
	{
		if (unlockedTargetBuffer)
		{
			if (otherBuffer != InvalidBuffer)
				LockBuffer(otherBuffer, BUFFER_LOCK_UNLOCK);
			UnlockReleaseBuffer(buffer);

			goto loop;
		}
		elog(PANIC, "tuple is too big: size %zu", len);
	}

	/*
	 * Remember the new page as our target for future insertions.
	 *
	 * XXX should we enter the new page into the free space map immediately,
	 * or just keep it for this backend's exclusive use in the short run
	 * (until VACUUM sees it)?  Seems to depend on whether you expect the
	 * current backend to make more insertions or not, which is probably a
	 * good bet most of the time.  So for now, don't add it to FSM yet.
	 */
	RelationSetTargetBlock(relation, targetBlock);

	return buffer;
}
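
/*
 * Editor's illustrative sketch (not part of the original file): assuming the
 * routine above is RelationGetBufferForTuple() from
 * src/backend/access/heap/hio.c, this shows roughly how a heap-insertion
 * path consumes its result.  The wrapper name example_place_tuple() is
 * hypothetical; the real callers (heap_insert() and friends) additionally
 * handle tuple header setup, WAL logging, and visibility-map maintenance,
 * all omitted here.
 */
#ifdef NOT_USED
static void
example_place_tuple(Relation relation, HeapTuple tup, int options,
					BulkInsertState bistate)
{
	Buffer		buffer;
	Buffer		vmbuffer = InvalidBuffer;

	/*
	 * Find (or create) a page with enough room; the buffer comes back pinned
	 * and exclusive-locked.  Passing 0 for num_pages lets the callee fall
	 * back to extending by a single page.
	 */
	buffer = RelationGetBufferForTuple(relation, tup->t_len,
									   InvalidBuffer, options, bistate,
									   &vmbuffer, NULL, 0);

	/* Place the tuple and dirty the buffer; WAL logging is omitted here. */
	RelationPutHeapTuple(relation, buffer, tup, false);
	MarkBufferDirty(buffer);

	UnlockReleaseBuffer(buffer);
	if (vmbuffer != InvalidBuffer)
		ReleaseBuffer(vmbuffer);
}
#endif							/* NOT_USED */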