PostgreSQL Source Code git master
Data Structures | Macros | Typedefs | Functions
verify_nbtree.c File Reference
#include "postgres.h"
#include "access/heaptoast.h"
#include "access/htup_details.h"
#include "access/nbtree.h"
#include "access/table.h"
#include "access/tableam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "verify_common.h"
#include "catalog/index.h"
#include "catalog/pg_am.h"
#include "catalog/pg_opfamily_d.h"
#include "common/pg_prng.h"
#include "lib/bloomfilter.h"
#include "miscadmin.h"
#include "storage/smgr.h"
#include "utils/guc.h"
#include "utils/memutils.h"
#include "utils/snapmgr.h"
Include dependency graph for verify_nbtree.c:

Go to the source code of this file.

Data Structures

struct   BtreeCheckState
 
struct   BtreeLevel
 
 
struct   BTCallbackState
 

Macros

 
 

Typedefs

typedef struct BtreeCheckState  BtreeCheckState
 
typedef struct BtreeLevel  BtreeLevel
 
 
typedef struct BTCallbackState  BTCallbackState
 

Functions

  PG_MODULE_MAGIC_EXT (.name="amcheck",.version=PG_VERSION)
 
 
 
static void  bt_index_check_callback (Relation indrel, Relation heaprel, void *state, bool readonly)
 
static void  bt_check_every_level (Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, bool rootdescend, bool checkunique)
 
 
 
static void  bt_recheck_sibling_links (BtreeCheckState *state, BlockNumber btpo_prev_from_target, BlockNumber leftcurrent)
 
 
static void  bt_report_duplicate (BtreeCheckState *state, BtreeLastVisibleEntry *lVis, ItemPointer nexttid, BlockNumber nblock, OffsetNumber noffset, int nposting)
 
 
 
 
static void  bt_child_check (BtreeCheckState *state, BTScanInsert targetkey, OffsetNumber downlinkoffnum)
 
static void  bt_child_highkey_check (BtreeCheckState *state, OffsetNumber target_downlinkoffnum, Page loaded_child, uint32 target_level)
 
static void  bt_downlink_missing_check (BtreeCheckState *state, bool rightsplit, BlockNumber blkno, Page page)
 
static void  bt_tuple_present_callback (Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate)
 
 
 
 
static bool  offset_is_negative_infinity (BTPageOpaque opaque, OffsetNumber offset)
 
 
 
 
static bool  invariant_l_nontarget_offset (BtreeCheckState *state, BTScanInsert key, BlockNumber nontargetblock, Page nontarget, OffsetNumber upperbound)
 
 
 
 
 
 
 
 
static bool  bt_pivot_tuple_identical (bool heapkeyspace, IndexTuple itup1, IndexTuple itup2)
 

Macro Definition Documentation

BTreeTupleGetNKeyAtts

#define BTreeTupleGetNKeyAtts (   itup,
  rel 

Definition at line 56 of file verify_nbtree.c.

InvalidBtreeLevel

#define InvalidBtreeLevel   ((uint32) InvalidBlockNumber)

Definition at line 55 of file verify_nbtree.c.

Typedef Documentation

BTCallbackState

BtreeCheckState

BtreeLastVisibleEntry

BtreeLevel

typedef struct BtreeLevel BtreeLevel

Function Documentation

bt_check_every_level()

static void bt_check_every_level ( Relation  rel,
Relation  heaprel,
bool  heapkeyspace,
bool  readonly,
bool  heapallindexed,
bool  rootdescend,
bool  checkunique 
)
static

Definition at line 376 of file verify_nbtree.c.

379{
381 Page metapage;
382 BTMetaPageData *metad;
383 uint32 previouslevel;
384 BtreeLevel current;
385 Snapshot snapshot = SnapshotAny;
386
387 if (!readonly)
388 elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"",
390 else
391 elog(DEBUG1, "verifying consistency of tree structure for index \"%s\" with cross-level checks",
393
394 /*
395 * This assertion matches the one in index_getnext_tid(). See page
396 * recycling/"visible to everyone" notes in nbtree README.
397 */
399
400 /*
401 * Initialize state for entire verification operation
402 */
403 state = palloc0(sizeof(BtreeCheckState));
404 state->rel = rel;
405 state->heaprel = heaprel;
406 state->heapkeyspace = heapkeyspace;
407 state->readonly = readonly;
408 state->heapallindexed = heapallindexed;
409 state->rootdescend = rootdescend;
410 state->checkunique = checkunique;
411 state->snapshot = InvalidSnapshot;
412
413 if (state->heapallindexed)
414 {
415 int64 total_pages;
416 int64 total_elems;
417 uint64 seed;
418
419 /*
420 * Size Bloom filter based on estimated number of tuples in index,
421 * while conservatively assuming that each block must contain at least
422 * MaxTIDsPerBTreePage / 3 "plain" tuples -- see
423 * bt_posting_plain_tuple() for definition, and details of how posting
424 * list tuples are handled.
425 */
426 total_pages = RelationGetNumberOfBlocks(rel);
427 total_elems = Max(total_pages * (MaxTIDsPerBTreePage / 3),
428 (int64) state->rel->rd_rel->reltuples);
429 /* Generate a random seed to avoid repetition */
431 /* Create Bloom filter to fingerprint index */
432 state->filter = bloom_create(total_elems, maintenance_work_mem, seed);
433 state->heaptuplespresent = 0;
434
435 /*
436 * Register our own snapshot in !readonly case, rather than asking
437 * table_index_build_scan() to do this for us later. This needs to
438 * happen before index fingerprinting begins, so we can later be
439 * certain that index fingerprinting should have reached all tuples
440 * returned by table_index_build_scan().
441 */
442 if (!state->readonly)
443 {
445
446 /*
447 * GetTransactionSnapshot() always acquires a new MVCC snapshot in
448 * READ COMMITTED mode. A new snapshot is guaranteed to have all
449 * the entries it requires in the index.
450 *
451 * We must defend against the possibility that an old xact
452 * snapshot was returned at higher isolation levels when that
453 * snapshot is not safe for index scans of the target index. This
454 * is possible when the snapshot sees tuples that are before the
455 * index's indcheckxmin horizon. Throwing an error here should be
456 * very rare. It doesn't seem worth using a secondary snapshot to
457 * avoid this.
458 */
459 if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
461 snapshot->xmin))
464 errmsg("index \"%s\" cannot be verified using transaction snapshot",
466 }
467 }
468
469 /*
470 * We need a snapshot to check the uniqueness of the index. For better
471 * performance take it once per index check. If snapshot already taken
472 * reuse it.
473 */
474 if (state->checkunique)
475 {
476 state->indexinfo = BuildIndexInfo(state->rel);
477 if (state->indexinfo->ii_Unique)
478 {
479 if (snapshot != SnapshotAny)
480 state->snapshot = snapshot;
481 else
483 }
484 }
485
486 Assert(!state->rootdescend || state->readonly);
487 if (state->rootdescend && !state->heapkeyspace)
489 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
490 errmsg("cannot verify that tuples from index \"%s\" can each be found by an independent index search",
492 errhint("Only B-Tree version 4 indexes support rootdescend verification.")));
493
494 /* Create context for page */
496 "amcheck context",
498 state->checkstrategy = GetAccessStrategy(BAS_BULKREAD);
499
500 /* Get true root block from meta-page */
502 metad = BTPageGetMeta(metapage);
503
504 /*
505 * Certain deletion patterns can result in "skinny" B-Tree indexes, where
506 * the fast root and true root differ.
507 *
508 * Start from the true root, not the fast root, unlike conventional index
509 * scans. This approach is more thorough, and removes the risk of
510 * following a stale fast root from the meta page.
511 */
512 if (metad->btm_fastroot != metad->btm_root)
514 (errcode(ERRCODE_NO_DATA),
515 errmsg_internal("harmless fast root mismatch in index \"%s\"",
517 errdetail_internal("Fast root block %u (level %u) differs from true root block %u (level %u).",
518 metad->btm_fastroot, metad->btm_fastlevel,
519 metad->btm_root, metad->btm_level)));
520
521 /*
522 * Starting at the root, verify every level. Move left to right, top to
523 * bottom. Note that there may be no pages other than the meta page (meta
524 * page can indicate that root is P_NONE when the index is totally empty).
525 */
526 previouslevel = InvalidBtreeLevel;
527 current.level = metad->btm_level;
528 current.leftmost = metad->btm_root;
529 current.istruerootlevel = true;
530 while (current.leftmost != P_NONE)
531 {
532 /*
533 * Verify this level, and get left most page for next level down, if
534 * not at leaf level
535 */
536 current = bt_check_level_from_leftmost(state, current);
537
538 if (current.leftmost == InvalidBlockNumber)
540 (errcode(ERRCODE_INDEX_CORRUPTED),
541 errmsg("index \"%s\" has no valid pages on level below %u or first level",
542 RelationGetRelationName(rel), previouslevel)));
543
544 previouslevel = current.level;
545 }
546
547 /*
548 * * Check whether heap contains unindexed/malformed tuples *
549 */
550 if (state->heapallindexed)
551 {
552 IndexInfo *indexinfo = BuildIndexInfo(state->rel);
553 TableScanDesc scan;
554
555 /*
556 * Create our own scan for table_index_build_scan(), rather than
557 * getting it to do so for us. This is required so that we can
558 * actually use the MVCC snapshot registered earlier in !readonly
559 * case.
560 *
561 * Note that table_index_build_scan() calls heap_endscan() for us.
562 */
563 scan = table_beginscan_strat(state->heaprel, /* relation */
564 snapshot, /* snapshot */
565 0, /* number of keys */
566 NULL, /* scan key */
567 true, /* buffer access strategy OK */
568 true); /* syncscan OK? */
569
570 /*
571 * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
572 * behaves in !readonly case.
573 *
574 * It's okay that we don't actually use the same lock strength for the
575 * heap relation as any other ii_Concurrent caller would in !readonly
576 * case. We have no reason to care about a concurrent VACUUM
577 * operation, since there isn't going to be a second scan of the heap
578 * that needs to be sure that there was no concurrent recycling of
579 * TIDs.
580 */
581 indexinfo->ii_Concurrent = !state->readonly;
582
583 /*
584 * Don't wait for uncommitted tuple xact commit/abort when index is a
585 * unique index on a catalog (or an index used by an exclusion
586 * constraint). This could otherwise happen in the readonly case.
587 */
588 indexinfo->ii_Unique = false;
589 indexinfo->ii_ExclusionOps = NULL;
590 indexinfo->ii_ExclusionProcs = NULL;
591 indexinfo->ii_ExclusionStrats = NULL;
592
593 elog(DEBUG1, "verifying that tuples from index \"%s\" are present in \"%s\"",
596
597 table_index_build_scan(state->heaprel, state->rel, indexinfo, true, false,
599
601 (errmsg_internal("finished verifying presence of " INT64_FORMAT " tuples from table \"%s\" with bitset %.2f%% set",
602 state->heaptuplespresent, RelationGetRelationName(heaprel),
603 100.0 * bloom_prop_bits_set(state->filter))));
604
605 if (snapshot != SnapshotAny)
606 UnregisterSnapshot(snapshot);
607
608 bloom_free(state->filter);
609 }
610
611 /* Be tidy: */
612 if (snapshot == SnapshotAny && state->snapshot != InvalidSnapshot)
613 UnregisterSnapshot(state->snapshot);
614 MemoryContextDelete(state->targetcontext);
615}
#define InvalidBlockNumber
Definition: block.h:33
void bloom_free(bloom_filter *filter)
Definition: bloomfilter.c:126
bloom_filter * bloom_create(int64 total_elems, int bloom_work_mem, uint64 seed)
Definition: bloomfilter.c:87
double bloom_prop_bits_set(bloom_filter *filter)
Definition: bloomfilter.c:187
@ BAS_BULKREAD
Definition: bufmgr.h:37
#define RelationGetNumberOfBlocks(reln)
Definition: bufmgr.h:283
PageData * Page
Definition: bufpage.h:82
#define Max(x, y)
Definition: c.h:997
#define INT64_FORMAT
Definition: c.h:556
int64_t int64
Definition: c.h:535
uint64_t uint64
Definition: c.h:539
uint32_t uint32
Definition: c.h:538
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1161
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1234
int errhint(const char *fmt,...)
Definition: elog.c:1321
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define DEBUG1
Definition: elog.h:30
#define ERROR
Definition: elog.h:39
#define elog(elevel,...)
Definition: elog.h:226
#define ereport(elevel,...)
Definition: elog.h:150
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:424
int maintenance_work_mem
Definition: globals.c:133
Assert(PointerIsAligned(start, uint64))
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
Definition: htup_details.h:324
IndexInfo * BuildIndexInfo(Relation index)
Definition: index.c:2428
void * palloc0(Size size)
Definition: mcxt.c:1395
MemoryContext CurrentMemoryContext
Definition: mcxt.c:160
void MemoryContextDelete(MemoryContext context)
Definition: mcxt.c:469
#define AllocSetContextCreate
Definition: memutils.h:129
#define ALLOCSET_DEFAULT_SIZES
Definition: memutils.h:160
#define BTPageGetMeta(p)
Definition: nbtree.h:122
#define MaxTIDsPerBTreePage
Definition: nbtree.h:186
#define P_NONE
Definition: nbtree.h:213
#define BTREE_METAPAGE
Definition: nbtree.h:149
uint64 pg_prng_uint64(pg_prng_state *state)
Definition: pg_prng.c:134
pg_prng_state pg_global_prng_state
Definition: pg_prng.c:34
#define ERRCODE_T_R_SERIALIZATION_FAILURE
Definition: pgbench.c:77
#define RelationGetRelationName(relation)
Definition: rel.h:548
TransactionId RecentXmin
Definition: snapmgr.c:159
Snapshot GetTransactionSnapshot(void)
Definition: snapmgr.c:271
void UnregisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:864
Snapshot RegisterSnapshot(Snapshot snapshot)
Definition: snapmgr.c:822
#define SnapshotAny
Definition: snapmgr.h:33
#define InvalidSnapshot
Definition: snapshot.h:119
uint32 btm_level
Definition: nbtree.h:109
BlockNumber btm_fastroot
Definition: nbtree.h:110
BlockNumber btm_root
Definition: nbtree.h:108
uint32 btm_fastlevel
Definition: nbtree.h:111
bool istruerootlevel
Definition: verify_nbtree.c:147
uint32 level
Definition: verify_nbtree.c:141
BlockNumber leftmost
Definition: verify_nbtree.c:144
HeapTupleHeader t_data
Definition: htup.h:68
bool ii_Unique
Definition: execnodes.h:200
uint16 * ii_ExclusionStrats
Definition: execnodes.h:192
Oid * ii_ExclusionOps
Definition: execnodes.h:188
bool ii_Concurrent
Definition: execnodes.h:210
Oid * ii_ExclusionProcs
Definition: execnodes.h:190
struct HeapTupleData * rd_indextuple
Definition: rel.h:194
Form_pg_index rd_index
Definition: rel.h:192
TransactionId xmin
Definition: snapshot.h:153
Definition: regguts.h:323
static double table_index_build_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)
Definition: tableam.h:1744
static TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, ScanKeyData *key, bool allow_strat, bool allow_sync)
Definition: tableam.h:900
bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition: transam.c:280
#define TransactionIdIsValid(xid)
Definition: transam.h:41
static BtreeLevel bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
Definition: verify_nbtree.c:636
static void bt_tuple_present_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *checkstate)
#define InvalidBtreeLevel
Definition: verify_nbtree.c:55
static Page palloc_btree_page(BtreeCheckState *state, BlockNumber blocknum)
#define IsolationUsesXactSnapshot()
Definition: xact.h:52

References ALLOCSET_DEFAULT_SIZES, AllocSetContextCreate, Assert(), BAS_BULKREAD, bloom_create(), bloom_free(), bloom_prop_bits_set(), bt_check_level_from_leftmost(), bt_tuple_present_callback(), BTMetaPageData::btm_fastlevel, BTMetaPageData::btm_fastroot, BTMetaPageData::btm_level, BTMetaPageData::btm_root, BTPageGetMeta, BTREE_METAPAGE, BuildIndexInfo(), CurrentMemoryContext, DEBUG1, elog, ereport, errcode(), ERRCODE_T_R_SERIALIZATION_FAILURE, errdetail_internal(), errhint(), errmsg(), errmsg_internal(), ERROR, GetAccessStrategy(), GetTransactionSnapshot(), HeapTupleHeaderGetXmin(), IndexInfo::ii_Concurrent, IndexInfo::ii_ExclusionOps, IndexInfo::ii_ExclusionProcs, IndexInfo::ii_ExclusionStrats, IndexInfo::ii_Unique, INT64_FORMAT, InvalidBlockNumber, InvalidBtreeLevel, InvalidSnapshot, IsolationUsesXactSnapshot, BtreeLevel::istruerootlevel, BtreeLevel::leftmost, BtreeLevel::level, maintenance_work_mem, Max, MaxTIDsPerBTreePage, MemoryContextDelete(), P_NONE, palloc0(), palloc_btree_page(), pg_global_prng_state, pg_prng_uint64(), RelationData::rd_index, RelationData::rd_indextuple, RecentXmin, RegisterSnapshot(), RelationGetNumberOfBlocks, RelationGetRelationName, SnapshotAny, HeapTupleData::t_data, table_beginscan_strat(), table_index_build_scan(), TransactionIdIsValid, TransactionIdPrecedes(), UnregisterSnapshot(), and SnapshotData::xmin.

Referenced by bt_index_check_callback().

bt_check_level_from_leftmost()

static BtreeLevel bt_check_level_from_leftmost ( BtreeCheckStatestate,
BtreeLevel  level 
)
static

Definition at line 636 of file verify_nbtree.c.

637{
638 /* State to establish early, concerning entire level */
639 BTPageOpaque opaque;
640 MemoryContext oldcontext;
641 BtreeLevel nextleveldown;
642
643 /* Variables for iterating across level using right links */
644 BlockNumber leftcurrent = P_NONE;
645 BlockNumber current = level.leftmost;
646
647 /* Initialize return state */
648 nextleveldown.leftmost = InvalidBlockNumber;
649 nextleveldown.level = InvalidBtreeLevel;
650 nextleveldown.istruerootlevel = false;
651
652 /* Use page-level context for duration of this call */
653 oldcontext = MemoryContextSwitchTo(state->targetcontext);
654
655 elog(DEBUG1, "verifying level %u%s", level.level,
656 level.istruerootlevel ?
657 " (true root level)" : level.level == 0 ? " (leaf level)" : "");
658
659 state->prevrightlink = InvalidBlockNumber;
660 state->previncompletesplit = false;
661
662 do
663 {
664 /* Don't rely on CHECK_FOR_INTERRUPTS() calls at lower level */
666
667 /* Initialize state for this iteration */
668 state->targetblock = current;
669 state->target = palloc_btree_page(state, state->targetblock);
670 state->targetlsn = PageGetLSN(state->target);
671
672 opaque = BTPageGetOpaque(state->target);
673
674 if (P_IGNORE(opaque))
675 {
676 /*
677 * Since there cannot be a concurrent VACUUM operation in readonly
678 * mode, and since a page has no links within other pages
679 * (siblings and parent) once it is marked fully deleted, it
680 * should be impossible to land on a fully deleted page in
681 * readonly mode. See bt_child_check() for further details.
682 *
683 * The bt_child_check() P_ISDELETED() check is repeated here so
684 * that pages that are only reachable through sibling links get
685 * checked.
686 */
687 if (state->readonly && P_ISDELETED(opaque))
689 (errcode(ERRCODE_INDEX_CORRUPTED),
690 errmsg("downlink or sibling link points to deleted block in index \"%s\"",
692 errdetail_internal("Block=%u left block=%u left link from block=%u.",
693 current, leftcurrent, opaque->btpo_prev)));
694
695 if (P_RIGHTMOST(opaque))
697 (errcode(ERRCODE_INDEX_CORRUPTED),
698 errmsg("block %u fell off the end of index \"%s\"",
699 current, RelationGetRelationName(state->rel))));
700 else
702 (errcode(ERRCODE_NO_DATA),
703 errmsg_internal("block %u of index \"%s\" concurrently deleted",
704 current, RelationGetRelationName(state->rel))));
705 goto nextpage;
706 }
707 else if (nextleveldown.leftmost == InvalidBlockNumber)
708 {
709 /*
710 * A concurrent page split could make the caller supplied leftmost
711 * block no longer contain the leftmost page, or no longer be the
712 * true root, but where that isn't possible due to heavyweight
713 * locking, check that the first valid page meets caller's
714 * expectations.
715 */
716 if (state->readonly)
717 {
718 if (!bt_leftmost_ignoring_half_dead(state, current, opaque))
720 (errcode(ERRCODE_INDEX_CORRUPTED),
721 errmsg("block %u is not leftmost in index \"%s\"",
722 current, RelationGetRelationName(state->rel))));
723
724 if (level.istruerootlevel && !P_ISROOT(opaque))
726 (errcode(ERRCODE_INDEX_CORRUPTED),
727 errmsg("block %u is not true root in index \"%s\"",
728 current, RelationGetRelationName(state->rel))));
729 }
730
731 /*
 732 * Before beginning any non-trivial examination of level, prepare
 733 * state for the next bt_check_level_from_leftmost() invocation for
 734 * the next level down (if any).
735 *
736 * There should be at least one non-ignorable page per level,
737 * unless this is the leaf level, which is assumed by caller to be
738 * final level.
739 */
740 if (!P_ISLEAF(opaque))
741 {
742 IndexTuple itup;
743 ItemId itemid;
744
745 /* Internal page -- downlink gets leftmost on next level */
746 itemid = PageGetItemIdCareful(state, state->targetblock,
747 state->target,
748 P_FIRSTDATAKEY(opaque));
749 itup = (IndexTuple) PageGetItem(state->target, itemid);
750 nextleveldown.leftmost = BTreeTupleGetDownLink(itup);
751 nextleveldown.level = opaque->btpo_level - 1;
752 }
753 else
754 {
755 /*
756 * Leaf page -- final level caller must process.
757 *
758 * Note that this could also be the root page, if there has
759 * been no root page split yet.
760 */
761 nextleveldown.leftmost = P_NONE;
762 nextleveldown.level = InvalidBtreeLevel;
763 }
764
765 /*
766 * Finished setting up state for this call/level. Control will
767 * never end up back here in any future loop iteration for this
768 * level.
769 */
770 }
771
772 /*
773 * Sibling links should be in mutual agreement. There arises
774 * leftcurrent == P_NONE && btpo_prev != P_NONE when the left sibling
775 * of the parent's low-key downlink is half-dead. (A half-dead page
776 * has no downlink from its parent.) Under heavyweight locking, the
777 * last bt_leftmost_ignoring_half_dead() validated this btpo_prev.
778 * Without heavyweight locking, validation of the P_NONE case remains
779 * unimplemented.
780 */
781 if (opaque->btpo_prev != leftcurrent && leftcurrent != P_NONE)
782 bt_recheck_sibling_links(state, opaque->btpo_prev, leftcurrent);
783
784 /* Check level */
785 if (level.level != opaque->btpo_level)
787 (errcode(ERRCODE_INDEX_CORRUPTED),
788 errmsg("leftmost down link for level points to block in index \"%s\" whose level is not one level down",
790 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
791 current, level.level, opaque->btpo_level)));
792
793 /* Verify invariants for page */
795
796nextpage:
797
798 /* Try to detect circular links */
799 if (current == leftcurrent || current == opaque->btpo_prev)
801 (errcode(ERRCODE_INDEX_CORRUPTED),
802 errmsg("circular link chain found in block %u of index \"%s\"",
803 current, RelationGetRelationName(state->rel))));
804
805 leftcurrent = current;
806 current = opaque->btpo_next;
807
808 if (state->lowkey)
809 {
810 Assert(state->readonly);
811 pfree(state->lowkey);
812 state->lowkey = NULL;
813 }
814
815 /*
816 * Copy current target high key as the low key of right sibling.
817 * Allocate memory in upper level context, so it would be cleared
818 * after reset of target context.
819 *
820 * We only need the low key in corner cases of checking child high
821 * keys. We use high key only when incomplete split on the child level
822 * falls to the boundary of pages on the target level. See
823 * bt_child_highkey_check() for details. So, typically we won't end
824 * up doing anything with low key, but it's simpler for general case
825 * high key verification to always have it available.
826 *
827 * The correctness of managing low key in the case of concurrent
828 * splits wasn't investigated yet. Thankfully we only need low key
829 * for readonly verification and concurrent splits won't happen.
830 */
831 if (state->readonly && !P_RIGHTMOST(opaque))
832 {
833 IndexTuple itup;
834 ItemId itemid;
835
836 itemid = PageGetItemIdCareful(state, state->targetblock,
837 state->target, P_HIKEY);
838 itup = (IndexTuple) PageGetItem(state->target, itemid);
839
840 state->lowkey = MemoryContextAlloc(oldcontext, IndexTupleSize(itup));
841 memcpy(state->lowkey, itup, IndexTupleSize(itup));
842 }
843
844 /* Free page and associated memory for this iteration */
845 MemoryContextReset(state->targetcontext);
846 }
847 while (current != P_NONE);
848
849 if (state->lowkey)
850 {
851 Assert(state->readonly);
852 pfree(state->lowkey);
853 state->lowkey = NULL;
854 }
855
856 /* Don't change context for caller */
857 MemoryContextSwitchTo(oldcontext);
858
859 return nextleveldown;
860}
uint32 BlockNumber
Definition: block.h:31
static Item PageGetItem(const PageData *page, const ItemIdData *itemId)
Definition: bufpage.h:354
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
IndexTupleData * IndexTuple
Definition: itup.h:53
static Size IndexTupleSize(const IndexTupleData *itup)
Definition: itup.h:71
void * MemoryContextAlloc(MemoryContext context, Size size)
Definition: mcxt.c:1229
void MemoryContextReset(MemoryContext context)
Definition: mcxt.c:400
void pfree(void *pointer)
Definition: mcxt.c:1594
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:122
#define P_ISLEAF(opaque)
Definition: nbtree.h:221
#define P_HIKEY
Definition: nbtree.h:368
#define BTPageGetOpaque(page)
Definition: nbtree.h:74
#define P_ISDELETED(opaque)
Definition: nbtree.h:223
#define P_FIRSTDATAKEY(opaque)
Definition: nbtree.h:370
#define P_ISROOT(opaque)
Definition: nbtree.h:222
#define P_RIGHTMOST(opaque)
Definition: nbtree.h:220
static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)
Definition: nbtree.h:557
#define P_IGNORE(opaque)
Definition: nbtree.h:226
static MemoryContext MemoryContextSwitchTo(MemoryContext context)
Definition: palloc.h:124
BlockNumber btpo_next
Definition: nbtree.h:66
BlockNumber btpo_prev
Definition: nbtree.h:65
uint32 btpo_level
Definition: nbtree.h:67
Definition: itemid.h:26
static bool bt_leftmost_ignoring_half_dead(BtreeCheckState *state, BlockNumber start, BTPageOpaque start_opaque)
static void bt_target_page_check(BtreeCheckState *state)
static ItemId PageGetItemIdCareful(BtreeCheckState *state, BlockNumber block, Page page, OffsetNumber offset)
static void bt_recheck_sibling_links(BtreeCheckState *state, BlockNumber btpo_prev_from_target, BlockNumber leftcurrent)

References Assert(), bt_leftmost_ignoring_half_dead(), bt_recheck_sibling_links(), bt_target_page_check(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), CHECK_FOR_INTERRUPTS, DEBUG1, elog, ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, IndexTupleSize(), InvalidBlockNumber, InvalidBtreeLevel, BtreeLevel::istruerootlevel, BtreeLevel::leftmost, BtreeLevel::level, MemoryContextAlloc(), MemoryContextReset(), MemoryContextSwitchTo(), P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISDELETED, P_ISLEAF, P_ISROOT, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetLSN(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_check_every_level().

bt_child_check()

static void bt_child_check ( BtreeCheckStatestate,
BTScanInsert  targetkey,
OffsetNumber  downlinkoffnum 
)
static

Definition at line 2404 of file verify_nbtree.c.

2406{
2407 ItemId itemid;
2408 IndexTuple itup;
2409 BlockNumber childblock;
2410 OffsetNumber offset;
2411 OffsetNumber maxoffset;
2412 Page child;
2413 BTPageOpaque copaque;
2414 BTPageOpaque topaque;
2415
2416 itemid = PageGetItemIdCareful(state, state->targetblock,
2417 state->target, downlinkoffnum);
2418 itup = (IndexTuple) PageGetItem(state->target, itemid);
2419 childblock = BTreeTupleGetDownLink(itup);
2420
2421 /*
2422 * Caller must have ShareLock on target relation, because of
2423 * considerations around page deletion by VACUUM.
2424 *
2425 * NB: In general, page deletion deletes the right sibling's downlink, not
2426 * the downlink of the page being deleted; the deleted page's downlink is
2427 * reused for its sibling. The key space is thereby consolidated between
2428 * the deleted page and its right sibling. (We cannot delete a parent
2429 * page's rightmost child unless it is the last child page, and we intend
2430 * to also delete the parent itself.)
2431 *
2432 * If this verification happened without a ShareLock, the following race
2433 * condition could cause false positives:
2434 *
2435 * In general, concurrent page deletion might occur, including deletion of
2436 * the left sibling of the child page that is examined here. If such a
2437 * page deletion were to occur, closely followed by an insertion into the
2438 * newly expanded key space of the child, a window for the false positive
2439 * opens up: the stale parent/target downlink originally followed to get
2440 * to the child legitimately ceases to be a lower bound on all items in
2441 * the page, since the key space was concurrently expanded "left".
2442 * (Insertion followed the "new" downlink for the child, not our now-stale
2443 * downlink, which was concurrently physically removed in target/parent as
2444 * part of deletion's first phase.)
2445 *
2446 * While we use various techniques elsewhere to perform cross-page
2447 * verification for !readonly callers, a similar trick seems difficult
2448 * here. The tricks used by bt_recheck_sibling_links and by
2449 * bt_right_page_check_scankey both involve verification of a same-level,
2450 * cross-sibling invariant. Cross-level invariants are far more squishy,
2451 * though. The nbtree REDO routines do not actually couple buffer locks
2452 * across levels during page splits, so making any cross-level check work
2453 * reliably in !readonly mode may be impossible.
2454 */
2455 Assert(state->readonly);
2456
2457 /*
2458 * Verify child page has the downlink key from target page (its parent) as
2459 * a lower bound; downlink must be strictly less than all keys on the
2460 * page.
2461 *
2462 * Check all items, rather than checking just the first and trusting that
2463 * the operator class obeys the transitive law.
2464 */
2465 topaque = BTPageGetOpaque(state->target);
2466 child = palloc_btree_page(state, childblock);
2467 copaque = BTPageGetOpaque(child);
2468 maxoffset = PageGetMaxOffsetNumber(child);
2469
2470 /*
2471 * Since we've already loaded the child block, combine this check with
2472 * check for downlink connectivity.
2473 */
2474 bt_child_highkey_check(state, downlinkoffnum,
2475 child, topaque->btpo_level);
2476
2477 /*
2478 * Since there cannot be a concurrent VACUUM operation in readonly mode,
2479 * and since a page has no links within other pages (siblings and parent)
2480 * once it is marked fully deleted, it should be impossible to land on a
2481 * fully deleted page.
2482 *
2483 * It does not quite make sense to enforce that the page cannot even be
2484 * half-dead, despite the fact the downlink is modified at the same stage
2485 * that the child leaf page is marked half-dead. That's incorrect because
2486 * there may occasionally be multiple downlinks from a chain of pages
2487 * undergoing deletion, where multiple successive calls are made to
2488 * _bt_unlink_halfdead_page() by VACUUM before it can finally safely mark
2489 * the leaf page as fully dead. While _bt_mark_page_halfdead() usually
2490 * removes the downlink to the leaf page that is marked half-dead, that's
2491 * not guaranteed, so it's possible we'll land on a half-dead page with a
2492 * downlink due to an interrupted multi-level page deletion.
2493 *
2494 * We go ahead with our checks if the child page is half-dead. It's safe
2495 * to do so because we do not test the child's high key, so it does not
2496 * matter that the original high key will have been replaced by a dummy
2497 * truncated high key within _bt_mark_page_halfdead(). All other page
2498 * items are left intact on a half-dead page, so there is still something
2499 * to test.
2500 */
2501 if (P_ISDELETED(copaque))
2502 ereport(ERROR,
2503 (errcode(ERRCODE_INDEX_CORRUPTED),
2504 errmsg("downlink to deleted page found in index \"%s\"",
2506 errdetail_internal("Parent block=%u child block=%u parent page lsn=%X/%08X.",
2507 state->targetblock, childblock,
2508 LSN_FORMAT_ARGS(state->targetlsn))));
2509
2510 for (offset = P_FIRSTDATAKEY(copaque);
2511 offset <= maxoffset;
2512 offset = OffsetNumberNext(offset))
2513 {
2514 /*
2515 * Skip comparison of target page key against "negative infinity"
2516 * item, if any. Checking it would indicate that it's not a strict
2517 * lower bound, but that's only because of the hard-coding for
2518 * negative infinity items within _bt_compare().
2519 *
2520 * If nbtree didn't truncate negative infinity tuples during internal
2521 * page splits then we'd expect child's negative infinity key to be
2522 * equal to the scankey/downlink from target/parent (it would be a
2523 * "low key" in this hypothetical scenario, and so it would still need
2524 * to be treated as a special case here).
2525 *
2526 * Negative infinity items can be thought of as a strict lower bound
2527 * that works transitively, with the last non-negative-infinity pivot
2528 * followed during a descent from the root as its "true" strict lower
2529 * bound. Only a small number of negative infinity items are truly
2530 * negative infinity; those that are the first items of leftmost
2531 * internal pages. In more general terms, a negative infinity item is
2532 * only negative infinity with respect to the subtree that the page is
2533 * at the root of.
2534 *
2535 * See also: bt_rootdescend(), which can even detect transitive
2536 * inconsistencies on cousin leaf pages.
2537 */
2538 if (offset_is_negative_infinity(copaque, offset))
2539 continue;
2540
2541 if (!invariant_l_nontarget_offset(state, targetkey, childblock, child,
2542 offset))
2543 ereport(ERROR,
2544 (errcode(ERRCODE_INDEX_CORRUPTED),
2545 errmsg("down-link lower bound invariant violated for index \"%s\"",
2547 errdetail_internal("Parent block=%u child index tid=(%u,%u) parent page lsn=%X/%08X.",
2548 state->targetblock, childblock, offset,
2549 LSN_FORMAT_ARGS(state->targetlsn))));
2550 }
2551
2552 pfree(child);
2553}
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition: bufpage.h:372
#define OffsetNumberNext(offsetNumber)
Definition: off.h:52
uint16 OffsetNumber
Definition: off.h:24
static bool offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
static void bt_child_highkey_check(BtreeCheckState *state, OffsetNumber target_downlinkoffnum, Page loaded_child, uint32 target_level)
static bool invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key, BlockNumber nontargetblock, Page nontarget, OffsetNumber upperbound)
#define LSN_FORMAT_ARGS(lsn)
Definition: xlogdefs.h:46

References Assert(), bt_child_highkey_check(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTreeTupleGetDownLink(), ereport, errcode(), errdetail_internal(), errmsg(), ERROR, invariant_l_nontarget_offset(), LSN_FORMAT_ARGS, offset_is_negative_infinity(), OffsetNumberNext, P_FIRSTDATAKEY, P_ISDELETED, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_target_page_check().

bt_child_highkey_check()

static void bt_child_highkey_check ( BtreeCheckStatestate,
OffsetNumber  target_downlinkoffnum,
Page  loaded_child,
uint32  target_level 
)
static

Definition at line 2157 of file verify_nbtree.c.

2161{
2162 BlockNumber blkno = state->prevrightlink;
2163 Page page;
2164 BTPageOpaque opaque;
2165 bool rightsplit = state->previncompletesplit;
2166 bool first = true;
2167 ItemId itemid;
2168 IndexTuple itup;
2169 BlockNumber downlink;
2170
2171 if (OffsetNumberIsValid(target_downlinkoffnum))
2172 {
2173 itemid = PageGetItemIdCareful(state, state->targetblock,
2174 state->target, target_downlinkoffnum);
2175 itup = (IndexTuple) PageGetItem(state->target, itemid);
2176 downlink = BTreeTupleGetDownLink(itup);
2177 }
2178 else
2179 {
2180 downlink = P_NONE;
2181 }
2182
2183 /*
2184 * If no previous rightlink is memorized for current level just below
2185 * target page's level, we are about to start from the leftmost page. We
2186 * can't follow rightlinks from previous page, because there is no
2187 * previous page. But we still can match high key.
2188 *
2189 * So we initialize variables for the loop above like there is previous
2190 * page referencing current child. Also we imply previous page to not
2191 * have incomplete split flag, that would make us require downlink for
2192 * current child. That's correct, because leftmost page on the level
2193 * should always have parent downlink.
2194 */
2195 if (!BlockNumberIsValid(blkno))
2196 {
2197 blkno = downlink;
2198 rightsplit = false;
2199 }
2200
2201 /* Move to the right on the child level */
2202 while (true)
2203 {
2204 /*
2205 * Did we traverse the whole tree level and this is check for pages to
2206 * the right of rightmost downlink?
2207 */
2208 if (blkno == P_NONE && downlink == P_NONE)
2209 {
2210 state->prevrightlink = InvalidBlockNumber;
2211 state->previncompletesplit = false;
2212 return;
2213 }
2214
2215 /* Did we traverse the whole tree level and don't find next downlink? */
2216 if (blkno == P_NONE)
2217 ereport(ERROR,
2218 (errcode(ERRCODE_INDEX_CORRUPTED),
2219 errmsg("can't traverse from downlink %u to downlink %u of index \"%s\"",
2220 state->prevrightlink, downlink,
2222
2223 /* Load page contents */
2224 if (blkno == downlink && loaded_child)
2225 page = loaded_child;
2226 else
2227 page = palloc_btree_page(state, blkno);
2228
2229 opaque = BTPageGetOpaque(page);
2230
2231 /* The first page we visit at the level should be leftmost */
2232 if (first && !BlockNumberIsValid(state->prevrightlink) &&
2233 !bt_leftmost_ignoring_half_dead(state, blkno, opaque))
2234 ereport(ERROR,
2235 (errcode(ERRCODE_INDEX_CORRUPTED),
2236 errmsg("the first child of leftmost target page is not leftmost of its level in index \"%s\"",
2238 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.",
2239 state->targetblock, blkno,
2240 LSN_FORMAT_ARGS(state->targetlsn))));
2241
2242 /* Do level sanity check */
2243 if ((!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque)) &&
2244 opaque->btpo_level != target_level - 1)
2245 ereport(ERROR,
2246 (errcode(ERRCODE_INDEX_CORRUPTED),
2247 errmsg("block found while following rightlinks from child of index \"%s\" has invalid level",
2249 errdetail_internal("Block pointed to=%u expected level=%u level in pointed to block=%u.",
2250 blkno, target_level - 1, opaque->btpo_level)));
2251
2252 /* Try to detect circular links */
2253 if ((!first && blkno == state->prevrightlink) || blkno == opaque->btpo_prev)
2254 ereport(ERROR,
2255 (errcode(ERRCODE_INDEX_CORRUPTED),
2256 errmsg("circular link chain found in block %u of index \"%s\"",
2257 blkno, RelationGetRelationName(state->rel))));
2258
2259 if (blkno != downlink && !P_IGNORE(opaque))
2260 {
2261 /* blkno probably has missing parent downlink */
2262 bt_downlink_missing_check(state, rightsplit, blkno, page);
2263 }
2264
2265 rightsplit = P_INCOMPLETE_SPLIT(opaque);
2266
2267 /*
2268 * If we visit page with high key, check that it is equal to the
2269 * target key next to corresponding downlink.
2270 */
2271 if (!rightsplit && !P_RIGHTMOST(opaque))
2272 {
2273 BTPageOpaque topaque;
2274 IndexTuple highkey;
2275 OffsetNumber pivotkey_offset;
2276
2277 /* Get high key */
2278 itemid = PageGetItemIdCareful(state, blkno, page, P_HIKEY);
2279 highkey = (IndexTuple) PageGetItem(page, itemid);
2280
2281 /*
2282 * There might be two situations when we examine high key. If
2283 * current child page is referenced by given target downlink, we
2284 * should look to the next offset number for matching key from
2285 * target page.
2286 *
2287 * Alternatively, we're following rightlinks somewhere in the
2288 * middle between page referenced by previous target's downlink
2289 * and the page referenced by current target's downlink. If
2290 * current child page hasn't incomplete split flag set, then its
2291 * high key should match to the target's key of current offset
2292 * number. This happens when a previous call here (to
2293 * bt_child_highkey_check()) found an incomplete split, and we
2294 * reach a right sibling page without a downlink -- the right
2295 * sibling page's high key still needs to be matched to a
2296 * separator key on the parent/target level.
2297 *
2298 * Don't apply OffsetNumberNext() to target_downlinkoffnum when we
2299 * already had to step right on the child level. Our traversal of
2300 * the child level must try to move in perfect lockstep behind (to
2301 * the left of) the target/parent level traversal.
2302 */
2303 if (blkno == downlink)
2304 pivotkey_offset = OffsetNumberNext(target_downlinkoffnum);
2305 else
2306 pivotkey_offset = target_downlinkoffnum;
2307
2308 topaque = BTPageGetOpaque(state->target);
2309
2310 if (!offset_is_negative_infinity(topaque, pivotkey_offset))
2311 {
2312 /*
2313 * If we're looking for the next pivot tuple in target page,
2314 * but there is no more pivot tuples, then we should match to
2315 * high key instead.
2316 */
2317 if (pivotkey_offset > PageGetMaxOffsetNumber(state->target))
2318 {
2319 if (P_RIGHTMOST(topaque))
2320 ereport(ERROR,
2321 (errcode(ERRCODE_INDEX_CORRUPTED),
2322 errmsg("child high key is greater than rightmost pivot key on target level in index \"%s\"",
2324 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.",
2325 state->targetblock, blkno,
2326 LSN_FORMAT_ARGS(state->targetlsn))));
2327 pivotkey_offset = P_HIKEY;
2328 }
2329 itemid = PageGetItemIdCareful(state, state->targetblock,
2330 state->target, pivotkey_offset);
2331 itup = (IndexTuple) PageGetItem(state->target, itemid);
2332 }
2333 else
2334 {
2335 /*
2336 * We cannot try to match child's high key to a negative
2337 * infinity key in target, since there is nothing to compare.
2338 * However, it's still possible to match child's high key
2339 * outside of target page. The reason why we're are is that
2340 * bt_child_highkey_check() was previously called for the
2341 * cousin page of 'loaded_child', which is incomplete split.
2342 * So, now we traverse to the right of that cousin page and
2343 * current child level page under consideration still belongs
2344 * to the subtree of target's left sibling. Thus, we need to
2345 * match child's high key to its left uncle page high key.
2346 * Thankfully we saved it, it's called a "low key" of target
2347 * page.
2348 */
2349 if (!state->lowkey)
2350 ereport(ERROR,
2351 (errcode(ERRCODE_INDEX_CORRUPTED),
2352 errmsg("can't find left sibling high key in index \"%s\"",
2354 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.",
2355 state->targetblock, blkno,
2356 LSN_FORMAT_ARGS(state->targetlsn))));
2357 itup = state->lowkey;
2358 }
2359
2360 if (!bt_pivot_tuple_identical(state->heapkeyspace, highkey, itup))
2361 {
2362 ereport(ERROR,
2363 (errcode(ERRCODE_INDEX_CORRUPTED),
2364 errmsg("mismatch between parent key and child high key in index \"%s\"",
2366 errdetail_internal("Target block=%u child block=%u target page lsn=%X/%08X.",
2367 state->targetblock, blkno,
2368 LSN_FORMAT_ARGS(state->targetlsn))));
2369 }
2370 }
2371
2372 /* Exit if we already found next downlink */
2373 if (blkno == downlink)
2374 {
2375 state->prevrightlink = opaque->btpo_next;
2376 state->previncompletesplit = rightsplit;
2377 return;
2378 }
2379
2380 /* Traverse to the next page using rightlink */
2381 blkno = opaque->btpo_next;
2382
2383 /* Free page contents if it's allocated by us */
2384 if (page != loaded_child)
2385 pfree(page);
2386 first = false;
2387 }
2388}
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
#define P_HAS_FULLXID(opaque)
Definition: nbtree.h:229
#define P_INCOMPLETE_SPLIT(opaque)
Definition: nbtree.h:228
#define OffsetNumberIsValid(offsetNumber)
Definition: off.h:39
static bool bt_pivot_tuple_identical(bool heapkeyspace, IndexTuple itup1, IndexTuple itup2)
static void bt_downlink_missing_check(BtreeCheckState *state, bool rightsplit, BlockNumber blkno, Page page)

References BlockNumberIsValid(), bt_downlink_missing_check(), bt_leftmost_ignoring_half_dead(), bt_pivot_tuple_identical(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), ereport, errcode(), errdetail_internal(), errmsg(), ERROR, InvalidBlockNumber, LSN_FORMAT_ARGS, offset_is_negative_infinity(), OffsetNumberIsValid, OffsetNumberNext, P_HAS_FULLXID, P_HIKEY, P_IGNORE, P_INCOMPLETE_SPLIT, P_ISDELETED, P_NONE, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_child_check(), and bt_target_page_check().

bt_downlink_missing_check()

static void bt_downlink_missing_check ( BtreeCheckStatestate,
bool  rightsplit,
BlockNumber  blkno,
Page  page 
)
static

Definition at line 2569 of file verify_nbtree.c.

2571{
2572 BTPageOpaque opaque = BTPageGetOpaque(page);
2573 ItemId itemid;
2574 IndexTuple itup;
2575 Page child;
2576 BTPageOpaque copaque;
2577 uint32 level;
2578 BlockNumber childblk;
2579 XLogRecPtr pagelsn;
2580
2581 Assert(state->readonly);
2582 Assert(!P_IGNORE(opaque));
2583
2584 /* No next level up with downlinks to fingerprint from the true root */
2585 if (P_ISROOT(opaque))
2586 return;
2587
2588 pagelsn = PageGetLSN(page);
2589
2590 /*
2591 * Incomplete (interrupted) page splits can account for the lack of a
2592 * downlink. Some inserting transaction should eventually complete the
2593 * page split in passing, when it notices that the left sibling page is
2594 * P_INCOMPLETE_SPLIT().
2595 *
2596 * In general, VACUUM is not prepared for there to be no downlink to a
2597 * page that it deletes. This is the main reason why the lack of a
2598 * downlink can be reported as corruption here. It's not obvious that an
2599 * invalid missing downlink can result in wrong answers to queries,
2600 * though, since index scans that land on the child may end up
2601 * consistently moving right. The handling of concurrent page splits (and
2602 * page deletions) within _bt_moveright() cannot distinguish
2603 * inconsistencies that last for a moment from inconsistencies that are
2604 * permanent and irrecoverable.
2605 *
2606 * VACUUM isn't even prepared to delete pages that have no downlink due to
2607 * an incomplete page split, but it can detect and reason about that case
2608 * by design, so it shouldn't be taken to indicate corruption. See
2609 * _bt_pagedel() for full details.
2610 */
2611 if (rightsplit)
2612 {
2614 (errcode(ERRCODE_NO_DATA),
2615 errmsg_internal("harmless interrupted page split detected in index \"%s\"",
2617 errdetail_internal("Block=%u level=%u left sibling=%u page lsn=%X/%08X.",
2618 blkno, opaque->btpo_level,
2619 opaque->btpo_prev,
2620 LSN_FORMAT_ARGS(pagelsn))));
2621 return;
2622 }
2623
2624 /*
2625 * Page under check is probably the "top parent" of a multi-level page
2626 * deletion. We'll need to descend the subtree to make sure that
2627 * descendant pages are consistent with that, though.
2628 *
2629 * If the page (which must be non-ignorable) is a leaf page, then clearly
2630 * it can't be the top parent. The lack of a downlink is probably a
2631 * symptom of a broad problem that could just as easily cause
2632 * inconsistencies anywhere else.
2633 */
2634 if (P_ISLEAF(opaque))
2635 ereport(ERROR,
2636 (errcode(ERRCODE_INDEX_CORRUPTED),
2637 errmsg("leaf index block lacks downlink in index \"%s\"",
2639 errdetail_internal("Block=%u page lsn=%X/%08X.",
2640 blkno,
2641 LSN_FORMAT_ARGS(pagelsn))));
2642
2643 /* Descend from the given page, which is an internal page */
2644 elog(DEBUG1, "checking for interrupted multi-level deletion due to missing downlink in index \"%s\"",
2646
2647 level = opaque->btpo_level;
2648 itemid = PageGetItemIdCareful(state, blkno, page, P_FIRSTDATAKEY(opaque));
2649 itup = (IndexTuple) PageGetItem(page, itemid);
2650 childblk = BTreeTupleGetDownLink(itup);
2651 for (;;)
2652 {
2654
2655 child = palloc_btree_page(state, childblk);
2656 copaque = BTPageGetOpaque(child);
2657
2658 if (P_ISLEAF(copaque))
2659 break;
2660
2661 /* Do an extra sanity check in passing on internal pages */
2662 if (copaque->btpo_level != level - 1)
2663 ereport(ERROR,
2664 (errcode(ERRCODE_INDEX_CORRUPTED),
2665 errmsg_internal("downlink points to block in index \"%s\" whose level is not one level down",
2667 errdetail_internal("Top parent/under check block=%u block pointed to=%u expected level=%u level in pointed to block=%u.",
2668 blkno, childblk,
2669 level - 1, copaque->btpo_level)));
2670
2671 level = copaque->btpo_level;
2672 itemid = PageGetItemIdCareful(state, childblk, child,
2673 P_FIRSTDATAKEY(copaque));
2674 itup = (IndexTuple) PageGetItem(child, itemid);
2675 childblk = BTreeTupleGetDownLink(itup);
2676 /* Be slightly more pro-active in freeing this memory, just in case */
2677 pfree(child);
2678 }
2679
2680 /*
2681 * Since there cannot be a concurrent VACUUM operation in readonly mode,
2682 * and since a page has no links within other pages (siblings and parent)
2683 * once it is marked fully deleted, it should be impossible to land on a
2684 * fully deleted page. See bt_child_check() for further details.
2685 *
2686 * The bt_child_check() P_ISDELETED() check is repeated here because
2687 * bt_child_check() does not visit pages reachable through negative
2688 * infinity items. Besides, bt_child_check() is unwilling to descend
2689 * multiple levels. (The similar bt_child_check() P_ISDELETED() check
2690 * within bt_check_level_from_leftmost() won't reach the page either,
2691 * since the leaf's live siblings should have their sibling links updated
2692 * to bypass the deletion target page when it is marked fully dead.)
2693 *
2694 * If this error is raised, it might be due to a previous multi-level page
2695 * deletion that failed to realize that it wasn't yet safe to mark the
2696 * leaf page as fully dead. A "dangling downlink" will still remain when
2697 * this happens. The fact that the dangling downlink's page (the leaf's
2698 * parent/ancestor page) lacked a downlink is incidental.
2699 */
2700 if (P_ISDELETED(copaque))
2701 ereport(ERROR,
2702 (errcode(ERRCODE_INDEX_CORRUPTED),
2703 errmsg_internal("downlink to deleted leaf page found in index \"%s\"",
2705 errdetail_internal("Top parent/target block=%u leaf block=%u top parent/under check lsn=%X/%08X.",
2706 blkno, childblk,
2707 LSN_FORMAT_ARGS(pagelsn))));
2708
2709 /*
2710 * Iff leaf page is half-dead, its high key top parent link should point
2711 * to what VACUUM considered to be the top parent page at the instant it
2712 * was interrupted. Provided the high key link actually points to the
2713 * page under check, the missing downlink we detected is consistent with
2714 * there having been an interrupted multi-level page deletion. This means
2715 * that the subtree with the page under check at its root (a page deletion
2716 * chain) is in a consistent state, enabling VACUUM to resume deleting the
2717 * entire chain the next time it encounters the half-dead leaf page.
2718 */
2719 if (P_ISHALFDEAD(copaque) && !P_RIGHTMOST(copaque))
2720 {
2721 itemid = PageGetItemIdCareful(state, childblk, child, P_HIKEY);
2722 itup = (IndexTuple) PageGetItem(child, itemid);
2723 if (BTreeTupleGetTopParent(itup) == blkno)
2724 return;
2725 }
2726
2727 ereport(ERROR,
2728 (errcode(ERRCODE_INDEX_CORRUPTED),
2729 errmsg("internal index block lacks downlink in index \"%s\"",
2731 errdetail_internal("Block=%u level=%u page lsn=%X/%08X.",
2732 blkno, opaque->btpo_level,
2733 LSN_FORMAT_ARGS(pagelsn))));
2734}
#define P_ISHALFDEAD(opaque)
Definition: nbtree.h:225
static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)
Definition: nbtree.h:621
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_prev, BTreeTupleGetDownLink(), BTreeTupleGetTopParent(), CHECK_FOR_INTERRUPTS, DEBUG1, elog, ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, LSN_FORMAT_ARGS, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISROOT, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetLSN(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_child_highkey_check().

bt_entry_unique_check()

static void bt_entry_unique_check ( BtreeCheckState *  state,
IndexTuple  itup,
BlockNumber  targetblock,
OffsetNumber  offset,
BtreeLastVisibleEntry *  lVis 
)
static

Definition at line 923 of file verify_nbtree.c.

926{
927 ItemPointer tid;
928 bool has_visible_entry = false;
929
930 Assert(targetblock != P_NONE);
931
932 /*
933 * Current tuple has posting list. Report duplicate if TID of any posting
934 * list entry is visible and lVis->tid is valid.
935 */
936 if (BTreeTupleIsPosting(itup))
937 {
938 for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
939 {
940 tid = BTreeTupleGetPostingN(itup, i);
942 {
943 has_visible_entry = true;
944 if (ItemPointerIsValid(lVis->tid))
945 {
947 lVis,
948 tid, targetblock,
949 offset, i);
950 }
951
952 /*
953 * Prevent double reporting unique constraint violation
954 * between the posting list entries of the first tuple on the
955 * page after cross-page check.
956 */
957 if (lVis->blkno != targetblock && ItemPointerIsValid(lVis->tid))
958 return;
959
960 lVis->blkno = targetblock;
961 lVis->offset = offset;
962 lVis->postingIndex = i;
963 lVis->tid = tid;
964 }
965 }
966 }
967
968 /*
969 * Current tuple has no posting list. If TID is visible save info about it
970 * for the next comparisons in the loop in bt_target_page_check(). Report
971 * duplicate if lVis->tid is already valid.
972 */
973 else
974 {
975 tid = BTreeTupleGetHeapTID(itup);
977 {
978 has_visible_entry = true;
979 if (ItemPointerIsValid(lVis->tid))
980 {
982 lVis,
983 tid, targetblock,
984 offset, -1);
985 }
986
987 lVis->blkno = targetblock;
988 lVis->offset = offset;
989 lVis->tid = tid;
990 lVis->postingIndex = -1;
991 }
992 }
993
994 if (!has_visible_entry &&
995 lVis->blkno != InvalidBlockNumber &&
996 lVis->blkno != targetblock)
997 {
998 char *posting = "";
999
1000 if (lVis->postingIndex >= 0)
1001 posting = psprintf(" posting %u", lVis->postingIndex);
1003 (errcode(ERRCODE_NO_DATA),
1004 errmsg("index uniqueness can not be checked for index tid=(%u,%u) in index \"%s\"",
1005 targetblock, offset,
1007 errdetail("It doesn't have visible heap tids and key is equal to the tid=(%u,%u)%s (points to heap tid=(%u,%u)).",
1008 lVis->blkno, lVis->offset, posting,
1011 errhint("VACUUM the table and repeat the check.")));
1012 }
1013}
int errdetail(const char *fmt,...)
Definition: elog.c:1207
i
int i
Definition: isn.c:77
static OffsetNumber ItemPointerGetOffsetNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:114
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition: itemptr.h:93
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition: itemptr.h:83
static uint16 BTreeTupleGetNPosting(IndexTuple posting)
Definition: nbtree.h:519
static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)
Definition: nbtree.h:545
static bool BTreeTupleIsPosting(IndexTuple itup)
Definition: nbtree.h:493
static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)
Definition: nbtree.h:639
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43
OffsetNumber offset
Definition: verify_nbtree.c:157
static void bt_report_duplicate(BtreeCheckState *state, BtreeLastVisibleEntry *lVis, ItemPointer nexttid, BlockNumber nblock, OffsetNumber noffset, int nposting)
Definition: verify_nbtree.c:883
static bool heap_entry_is_visible(BtreeCheckState *state, ItemPointer tid)
Definition: verify_nbtree.c:864

References Assert(), BtreeLastVisibleEntry::blkno, bt_report_duplicate(), BTreeTupleGetHeapTID(), BTreeTupleGetNPosting(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), DEBUG1, ereport, errcode(), errdetail(), errhint(), errmsg(), heap_entry_is_visible(), i, InvalidBlockNumber, ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumberNoCheck(), ItemPointerIsValid(), BtreeLastVisibleEntry::offset, P_NONE, BtreeLastVisibleEntry::postingIndex, psprintf(), RelationGetRelationName, and BtreeLastVisibleEntry::tid.

Referenced by bt_target_page_check().

bt_index_check()

Datum bt_index_check ( PG_FUNCTION_ARGS  )

Definition at line 250 of file verify_nbtree.c.

251{
252 Oid indrelid = PG_GETARG_OID(0);
254
255 args.heapallindexed = false;
256 args.rootdescend = false;
257 args.parentcheck = false;
258 args.checkunique = false;
259
260 if (PG_NARGS() >= 2)
261 args.heapallindexed = PG_GETARG_BOOL(1);
262 if (PG_NARGS() >= 3)
263 args.checkunique = PG_GETARG_BOOL(2);
264
265 amcheck_lock_relation_and_check(indrelid, BTREE_AM_OID,
268
270}
#define PG_RETURN_VOID()
Definition: fmgr.h:349
#define PG_GETARG_OID(n)
Definition: fmgr.h:275
#define PG_NARGS()
Definition: fmgr.h:203
#define PG_GETARG_BOOL(n)
Definition: fmgr.h:274
#define AccessShareLock
Definition: lockdefs.h:36
unsigned int Oid
Definition: postgres_ext.h:32
void amcheck_lock_relation_and_check(Oid indrelid, Oid am_id, IndexDoCheckCallback check, LOCKMODE lockmode, void *state)
Definition: verify_common.c:62
static void bt_index_check_callback(Relation indrel, Relation heaprel, void *state, bool readonly)
Definition: verify_nbtree.c:310

References AccessShareLock, amcheck_lock_relation_and_check(), generate_unaccent_rules::args, bt_index_check_callback(), PG_GETARG_BOOL, PG_GETARG_OID, PG_NARGS, and PG_RETURN_VOID.

bt_index_check_callback()

static void bt_index_check_callback ( Relation  indrel,
Relation  heaprel,
void *  state,
bool  readonly 
)
static

Definition at line 310 of file verify_nbtree.c.

311{
313 bool heapkeyspace,
314 allequalimage;
315
318 (errcode(ERRCODE_INDEX_CORRUPTED),
319 errmsg("index \"%s\" lacks a main relation fork",
320 RelationGetRelationName(indrel))));
321
322 /* Extract metadata from metapage, and sanitize it in passing */
323 _bt_metaversion(indrel, &heapkeyspace, &allequalimage);
324 if (allequalimage && !heapkeyspace)
326 (errcode(ERRCODE_INDEX_CORRUPTED),
327 errmsg("index \"%s\" metapage has equalimage field set on unsupported nbtree version",
328 RelationGetRelationName(indrel))));
329 if (allequalimage && !_bt_allequalimage(indrel, false))
330 {
331 bool has_interval_ops = false;
332
333 for (int i = 0; i < IndexRelationGetNumberOfKeyAttributes(indrel); i++)
334 if (indrel->rd_opfamily[i] == INTERVAL_BTREE_FAM_OID)
335 {
336 has_interval_ops = true;
338 (errcode(ERRCODE_INDEX_CORRUPTED),
339 errmsg("index \"%s\" metapage incorrectly indicates that deduplication is safe",
341 has_interval_ops
342 ? errhint("This is known of \"interval\" indexes last built on a version predating 2023-11.")
343 : 0));
344 }
345 }
346
347 /* Check index, possibly against table it is an index on */
348 bt_check_every_level(indrel, heaprel, heapkeyspace, readonly,
349 args->heapallindexed, args->rootdescend, args->checkunique);
350}
void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)
Definition: nbtpage.c:739
bool _bt_allequalimage(Relation rel, bool debugmessage)
Definition: nbtutils.c:4327
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:576
#define IndexRelationGetNumberOfKeyAttributes(relation)
Definition: rel.h:533
@ MAIN_FORKNUM
Definition: relpath.h:58
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
Oid * rd_opfamily
Definition: rel.h:207
static void bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, bool readonly, bool heapallindexed, bool rootdescend, bool checkunique)
Definition: verify_nbtree.c:376

References _bt_allequalimage(), _bt_metaversion(), generate_unaccent_rules::args, bt_check_every_level(), ereport, errcode(), errhint(), errmsg(), ERROR, i, IndexRelationGetNumberOfKeyAttributes, MAIN_FORKNUM, RelationData::rd_opfamily, RelationGetRelationName, RelationGetSmgr(), and smgrexists().

Referenced by bt_index_check(), and bt_index_parent_check().

bt_index_parent_check()

Datum bt_index_parent_check ( PG_FUNCTION_ARGS  )

Definition at line 282 of file verify_nbtree.c.

283{
284 Oid indrelid = PG_GETARG_OID(0);
286
287 args.heapallindexed = false;
288 args.rootdescend = false;
289 args.parentcheck = true;
290 args.checkunique = false;
291
292 if (PG_NARGS() >= 2)
293 args.heapallindexed = PG_GETARG_BOOL(1);
294 if (PG_NARGS() >= 3)
295 args.rootdescend = PG_GETARG_BOOL(2);
296 if (PG_NARGS() >= 4)
297 args.checkunique = PG_GETARG_BOOL(3);
298
299 amcheck_lock_relation_and_check(indrelid, BTREE_AM_OID,
301 ShareLock, &args);
302
304}
#define ShareLock
Definition: lockdefs.h:40

References amcheck_lock_relation_and_check(), generate_unaccent_rules::args, bt_index_check_callback(), PG_GETARG_BOOL, PG_GETARG_OID, PG_NARGS, PG_RETURN_VOID, and ShareLock.

bt_leftmost_ignoring_half_dead()

static bool bt_leftmost_ignoring_half_dead ( BtreeCheckStatestate,
BlockNumber  start,
BTPageOpaque  start_opaque 
)
static

Definition at line 1022 of file verify_nbtree.c.

1025{
1026 BlockNumber reached = start_opaque->btpo_prev,
1027 reached_from = start;
1028 bool all_half_dead = true;
1029
1030 /*
1031 * To handle the !readonly case, we'd need to accept BTP_DELETED pages and
1032 * potentially observe nbtree/README "Page deletion and backwards scans".
1033 */
1034 Assert(state->readonly);
1035
1036 while (reached != P_NONE && all_half_dead)
1037 {
1038 Page page = palloc_btree_page(state, reached);
1039 BTPageOpaque reached_opaque = BTPageGetOpaque(page);
1040
1042
1043 /*
1044 * Try to detect btpo_prev circular links. _bt_unlink_halfdead_page()
1045 * writes that side-links will continue to point to the siblings.
1046 * Check btpo_next for that property.
1047 */
1048 all_half_dead = P_ISHALFDEAD(reached_opaque) &&
1049 reached != start &&
1050 reached != reached_from &&
1051 reached_opaque->btpo_next == reached_from;
1052 if (all_half_dead)
1053 {
1054 XLogRecPtr pagelsn = PageGetLSN(page);
1055
1056 /* pagelsn should point to an XLOG_BTREE_MARK_PAGE_HALFDEAD */
1058 (errcode(ERRCODE_NO_DATA),
1059 errmsg_internal("harmless interrupted page deletion detected in index \"%s\"",
1061 errdetail_internal("Block=%u right block=%u page lsn=%X/%08X.",
1062 reached, reached_from,
1063 LSN_FORMAT_ARGS(pagelsn))));
1064
1065 reached_from = reached;
1066 reached = reached_opaque->btpo_prev;
1067 }
1068
1069 pfree(page);
1070 }
1071
1072 return all_half_dead;
1073}
return str start

References Assert(), BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, CHECK_FOR_INTERRUPTS, DEBUG1, ereport, errcode(), errdetail_internal(), errmsg_internal(), LSN_FORMAT_ARGS, P_ISHALFDEAD, P_NONE, PageGetLSN(), palloc_btree_page(), pfree(), RelationGetRelationName, and start.

Referenced by bt_check_level_from_leftmost(), and bt_child_highkey_check().

bt_mkscankey_pivotsearch()

static BTScanInsert bt_mkscankey_pivotsearch ( Relation  rel,
IndexTuple  itup 
)
inlinestatic

Definition at line 3480 of file verify_nbtree.c.

3481{
3482 BTScanInsert skey;
3483
3484 skey = _bt_mkscankey(rel, itup);
3485 skey->backward = true;
3486
3487 return skey;
3488}
BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)
Definition: nbtutils.c:97
bool backward
Definition: nbtree.h:801

References _bt_mkscankey(), and BTScanInsertData::backward.

Referenced by bt_right_page_check_scankey(), and bt_target_page_check().

bt_normalize_tuple()

static IndexTuple bt_normalize_tuple ( BtreeCheckStatestate,
IndexTuple  itup 
)
static

Definition at line 2860 of file verify_nbtree.c.

2861{
2862 TupleDesc tupleDescriptor = RelationGetDescr(state->rel);
2863 Datum normalized[INDEX_MAX_KEYS];
2864 bool isnull[INDEX_MAX_KEYS];
2865 bool need_free[INDEX_MAX_KEYS];
2866 bool formnewtup = false;
2867 IndexTuple reformed;
2868 int i;
2869
2870 /* Caller should only pass "logical" non-pivot tuples here */
2872
2873 /* Easy case: It's immediately clear that tuple has no varlena datums */
2874 if (!IndexTupleHasVarwidths(itup))
2875 return itup;
2876
2877 for (i = 0; i < tupleDescriptor->natts; i++)
2878 {
2880
2881 att = TupleDescAttr(tupleDescriptor, i);
2882
2883 /* Assume untoasted/already normalized datum initially */
2884 need_free[i] = false;
2885 normalized[i] = index_getattr(itup, att->attnum,
2886 tupleDescriptor,
2887 &isnull[i]);
2888 if (att->attbyval || att->attlen != -1 || isnull[i])
2889 continue;
2890
2891 /*
2892 * Callers always pass a tuple that could safely be inserted into the
2893 * index without further processing, so an external varlena header
2894 * should never be encountered here
2895 */
2896 if (VARATT_IS_EXTERNAL(DatumGetPointer(normalized[i])))
2897 ereport(ERROR,
2898 (errcode(ERRCODE_INDEX_CORRUPTED),
2899 errmsg("external varlena datum in tuple that references heap row (%u,%u) in index \"%s\"",
2903 else if (!VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])) &&
2904 VARSIZE(DatumGetPointer(normalized[i])) > TOAST_INDEX_TARGET &&
2905 (att->attstorage == TYPSTORAGE_EXTENDED ||
2906 att->attstorage == TYPSTORAGE_MAIN))
2907 {
2908 /*
2909 * This value will be compressed by index_form_tuple() with the
2910 * current storage settings. We may be here because this tuple
2911 * was formed with different storage settings. So, force forming.
2912 */
2913 formnewtup = true;
2914 }
2915 else if (VARATT_IS_COMPRESSED(DatumGetPointer(normalized[i])))
2916 {
2917 formnewtup = true;
2918 normalized[i] = PointerGetDatum(PG_DETOAST_DATUM(normalized[i]));
2919 need_free[i] = true;
2920 }
2921
2922 /*
2923 * Short tuples may have 1B or 4B header. Convert 4B header of short
2924 * tuples to 1B
2925 */
2926 else if (VARATT_CAN_MAKE_SHORT(DatumGetPointer(normalized[i])))
2927 {
2928 /* convert to short varlena */
2930 char *data = palloc(len);
2931
2933 memcpy(data + 1, VARDATA(DatumGetPointer(normalized[i])), len - 1);
2934
2935 formnewtup = true;
2936 normalized[i] = PointerGetDatum(data);
2937 need_free[i] = true;
2938 }
2939 }
2940
2941 /*
2942 * Easier case: Tuple has varlena datums, none of which are compressed or
2943 * short with 4B header
2944 */
2945 if (!formnewtup)
2946 return itup;
2947
2948 /*
2949 * Hard case: Tuple had compressed varlena datums that necessitate
2950 * creating normalized version of the tuple from uncompressed input datums
2951 * (normalized input datums). This is rather naive, but shouldn't be
2952 * necessary too often.
2953 *
2954 * In the heap, tuples may contain short varlena datums with both 1B
2955 * header and 4B headers. But the corresponding index tuple should always
2956 * have such varlena's with 1B headers. So, if there is a short varlena
2957 * with 4B header, we need to convert it for fingerprinting.
2958 *
2959 * Note that we rely on deterministic index_form_tuple() TOAST compression
2960 * of normalized input.
2961 */
2962 reformed = index_form_tuple(tupleDescriptor, normalized, isnull);
2963 reformed->t_tid = itup->t_tid;
2964
2965 /* Cannot leak memory here */
2966 for (i = 0; i < tupleDescriptor->natts; i++)
2967 if (need_free[i])
2968 pfree(DatumGetPointer(normalized[i]));
2969
2970 return reformed;
2971}
size_t Size
Definition: c.h:610
#define PG_DETOAST_DATUM(datum)
Definition: fmgr.h:240
#define TOAST_INDEX_TARGET
Definition: heaptoast.h:68
IndexTuple index_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition: indextuple.c:44
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition: itemptr.h:124
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition: itemptr.h:103
static bool IndexTupleHasVarwidths(const IndexTupleData *itup)
Definition: itup.h:83
static Datum index_getattr(IndexTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
Definition: itup.h:131
void * palloc(Size size)
Definition: mcxt.c:1365
static bool BTreeTupleIsPivot(IndexTuple itup)
Definition: nbtree.h:481
FormData_pg_attribute * Form_pg_attribute
Definition: pg_attribute.h:202
#define INDEX_MAX_KEYS
const void size_t len
const void * data
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
uint64_t Datum
Definition: postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
#define RelationGetDescr(relation)
Definition: rel.h:540
ItemPointerData t_tid
Definition: itup.h:37
int natts
Definition: tupdesc.h:137
static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)
Definition: tupdesc.h:160
static bool VARATT_CAN_MAKE_SHORT(const void *PTR)
Definition: varatt.h:417
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition: varatt.h:354
static Size VARSIZE(const void *PTR)
Definition: varatt.h:298
static char * VARDATA(const void *PTR)
Definition: varatt.h:305
static Size VARATT_CONVERTED_SHORT_SIZE(const void *PTR)
Definition: varatt.h:425
static bool VARATT_IS_COMPRESSED(const void *PTR)
Definition: varatt.h:347
static void SET_VARSIZE_SHORT(void *PTR, Size len)
Definition: varatt.h:439

References Assert(), BTreeTupleIsPivot(), BTreeTupleIsPosting(), data, DatumGetPointer(), ereport, errcode(), errmsg(), ERROR, i, index_form_tuple(), index_getattr(), INDEX_MAX_KEYS, IndexTupleHasVarwidths(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), len, TupleDescData::natts, palloc(), pfree(), PG_DETOAST_DATUM, PointerGetDatum(), RelationGetDescr, RelationGetRelationName, SET_VARSIZE_SHORT(), IndexTupleData::t_tid, TOAST_INDEX_TARGET, TupleDescAttr(), VARATT_CAN_MAKE_SHORT(), VARATT_CONVERTED_SHORT_SIZE(), VARATT_IS_COMPRESSED(), VARATT_IS_EXTERNAL(), VARDATA(), and VARSIZE().

Referenced by bt_target_page_check(), and bt_tuple_present_callback().

bt_pivot_tuple_identical()

static bool bt_pivot_tuple_identical ( bool  heapkeyspace,
IndexTuple  itup1,
IndexTuple  itup2 
)
static

Definition at line 2084 of file verify_nbtree.c.

2085{
2086 if (IndexTupleSize(itup1) != IndexTupleSize(itup2))
2087 return false;
2088
2089 if (heapkeyspace)
2090 {
2091 /*
2092 * Offset number will contain important information in heapkeyspace
2093 * indexes: the number of attributes left in the pivot tuple following
2094 * suffix truncation. Don't skip over it (compare it too).
2095 */
2096 if (memcmp(&itup1->t_tid.ip_posid, &itup2->t_tid.ip_posid,
2097 IndexTupleSize(itup1) -
2098 offsetof(ItemPointerData, ip_posid)) != 0)
2099 return false;
2100 }
2101 else
2102 {
2103 /*
2104 * Cannot rely on offset number field having consistent value across
2105 * levels on pg_upgrade'd !heapkeyspace indexes. Compare contents of
2106 * tuple starting from just after item pointer (i.e. after block
2107 * number and offset number).
2108 */
2109 if (memcmp(&itup1->t_info, &itup2->t_info,
2110 IndexTupleSize(itup1) -
2111 offsetof(IndexTupleData, t_info)) != 0)
2112 return false;
2113 }
2114
2115 return true;
2116}
unsigned short t_info
Definition: itup.h:49
OffsetNumber ip_posid
Definition: itemptr.h:39

References IndexTupleSize(), ItemPointerData::ip_posid, IndexTupleData::t_info, and IndexTupleData::t_tid.

Referenced by bt_child_highkey_check().

bt_posting_plain_tuple()

static IndexTuple bt_posting_plain_tuple ( IndexTuple  itup,
int  n 
)
inlinestatic

Definition at line 2988 of file verify_nbtree.c.

2989{
2991
2992 /* Returns non-posting-list tuple */
2993 return _bt_form_posting(itup, BTreeTupleGetPostingN(itup, n), 1);
2994}
IndexTuple _bt_form_posting(IndexTuple base, ItemPointer htids, int nhtids)
Definition: nbtdedup.c:865

References _bt_form_posting(), Assert(), BTreeTupleGetPostingN(), and BTreeTupleIsPosting().

Referenced by bt_target_page_check().

bt_recheck_sibling_links()

static void bt_recheck_sibling_links ( BtreeCheckStatestate,
BlockNumber  btpo_prev_from_target,
BlockNumber  leftcurrent 
)
static

Definition at line 1111 of file verify_nbtree.c.

1114{
1115 /* passing metapage to BTPageGetOpaque() would give irrelevant findings */
1116 Assert(leftcurrent != P_NONE);
1117
1118 if (!state->readonly)
1119 {
1120 Buffer lbuf;
1121 Buffer newtargetbuf;
1122 Page page;
1123 BTPageOpaque opaque;
1124 BlockNumber newtargetblock;
1125
1126 /* Couple locks in the usual order for nbtree: Left to right */
1127 lbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM, leftcurrent,
1128 RBM_NORMAL, state->checkstrategy);
1129 LockBuffer(lbuf, BT_READ);
1130 _bt_checkpage(state->rel, lbuf);
1131 page = BufferGetPage(lbuf);
1132 opaque = BTPageGetOpaque(page);
1133 if (P_ISDELETED(opaque))
1134 {
1135 /*
1136 * Cannot reason about concurrently deleted page -- the left link
1137 * in the page to the right is expected to point to some other
1138 * page to the left (not leftcurrent page).
1139 *
1140 * Note that we deliberately don't give up with a half-dead page.
1141 */
1142 UnlockReleaseBuffer(lbuf);
1143 return;
1144 }
1145
1146 newtargetblock = opaque->btpo_next;
1147 /* Avoid self-deadlock when newtargetblock == leftcurrent */
1148 if (newtargetblock != leftcurrent)
1149 {
1150 newtargetbuf = ReadBufferExtended(state->rel, MAIN_FORKNUM,
1151 newtargetblock, RBM_NORMAL,
1152 state->checkstrategy);
1153 LockBuffer(newtargetbuf, BT_READ);
1154 _bt_checkpage(state->rel, newtargetbuf);
1155 page = BufferGetPage(newtargetbuf);
1156 opaque = BTPageGetOpaque(page);
1157 /* btpo_prev_from_target may have changed; update it */
1158 btpo_prev_from_target = opaque->btpo_prev;
1159 }
1160 else
1161 {
1162 /*
1163 * leftcurrent right sibling points back to leftcurrent block.
1164 * Index is corrupt. Easiest way to handle this is to pretend
1165 * that we actually read from a distinct page that has an invalid
1166 * block number in its btpo_prev.
1167 */
1168 newtargetbuf = InvalidBuffer;
1169 btpo_prev_from_target = InvalidBlockNumber;
1170 }
1171
1172 /*
1173 * No need to check P_ISDELETED here, since new target block cannot be
1174 * marked deleted as long as we hold a lock on lbuf
1175 */
1176 if (BufferIsValid(newtargetbuf))
1177 UnlockReleaseBuffer(newtargetbuf);
1178 UnlockReleaseBuffer(lbuf);
1179
1180 if (btpo_prev_from_target == leftcurrent)
1181 {
1182 /* Report split in left sibling, not target (or new target) */
1184 (errcode(ERRCODE_INTERNAL_ERROR),
1185 errmsg_internal("harmless concurrent page split detected in index \"%s\"",
1187 errdetail_internal("Block=%u new right sibling=%u original right sibling=%u.",
1188 leftcurrent, newtargetblock,
1189 state->targetblock)));
1190 return;
1191 }
1192
1193 /*
1194 * Index is corrupt. Make sure that we report correct target page.
1195 *
1196 * This could have changed in cases where there was a concurrent page
1197 * split, as well as index corruption (at least in theory). Note that
1198 * btpo_prev_from_target was already updated above.
1199 */
1200 state->targetblock = newtargetblock;
1201 }
1202
1203 ereport(ERROR,
1204 (errcode(ERRCODE_INDEX_CORRUPTED),
1205 errmsg("left link/right link pair in index \"%s\" not in agreement",
1207 errdetail_internal("Block=%u left block=%u left link from block=%u.",
1208 state->targetblock, leftcurrent,
1209 btpo_prev_from_target)));
1210}
int Buffer
Definition: buf.h:23
#define InvalidBuffer
Definition: buf.h:25
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5355
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5572
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:805
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:417
@ RBM_NORMAL
Definition: bufmgr.h:46
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:368
void _bt_checkpage(Relation rel, Buffer buf)
Definition: nbtpage.c:797
#define BT_READ
Definition: nbtree.h:730

References _bt_checkpage(), Assert(), BT_READ, BTPageGetOpaque, BTPageOpaqueData::btpo_next, BTPageOpaqueData::btpo_prev, BufferGetPage(), BufferIsValid(), DEBUG1, ereport, errcode(), errdetail_internal(), errmsg(), errmsg_internal(), ERROR, InvalidBlockNumber, InvalidBuffer, LockBuffer(), MAIN_FORKNUM, P_ISDELETED, P_NONE, RBM_NORMAL, ReadBufferExtended(), RelationGetRelationName, and UnlockReleaseBuffer().

Referenced by bt_check_level_from_leftmost().

bt_report_duplicate()

static void bt_report_duplicate ( BtreeCheckStatestate,
ItemPointer  nexttid,
BlockNumber  nblock,
OffsetNumber  noffset,
int  nposting 
)
static

Definition at line 883 of file verify_nbtree.c.

/*
 * Raise an ERRCODE_INDEX_CORRUPTED error reporting a uniqueness violation.
 *
 * lVis describes the last visible entry seen for the current key;
 * (nblock, noffset) and nexttid identify the conflicting entry.  The
 * posting-list offsets (lVis->postingIndex, nposting) are only reported
 * when non-negative, i.e. when the entry actually lies inside a posting
 * list tuple.
 *
 * NOTE(review): several argument lines of the psprintf()/ereport() calls
 * (orig. lines 896-900, 912, 915) are elided in this rendered listing.
 */
887{
888 char *htid,
889 *nhtid,
890 *itid,
891 *nitid = "",
892 *pposting = "",
893 *pnposting = "";
894
895 htid = psprintf("tid=(%u,%u)",
898 nhtid = psprintf("tid=(%u,%u)",
901 itid = psprintf("tid=(%u,%u)", lVis->blkno, lVis->offset);
902
/* Only mention the second index tid when it differs from the first */
903 if (nblock != lVis->blkno || noffset != lVis->offset)
904 nitid = psprintf(" tid=(%u,%u)", nblock, noffset);
905
906 if (lVis->postingIndex >= 0)
907 pposting = psprintf(" posting %u", lVis->postingIndex);
908
909 if (nposting >= 0)
910 pnposting = psprintf(" posting %u", nposting);
911
913 (errcode(ERRCODE_INDEX_CORRUPTED),
914 errmsg("index uniqueness is violated for index \"%s\"",
916 errdetail("Index %s%s and%s%s (point to heap %s and %s) page lsn=%X/%08X.",
917 itid, pposting, nitid, pnposting, htid, nhtid,
918 LSN_FORMAT_ARGS(state->targetlsn))));
919}

References BtreeLastVisibleEntry::blkno, ereport, errcode(), errdetail(), errmsg(), ERROR, ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumberNoCheck(), LSN_FORMAT_ARGS, BtreeLastVisibleEntry::offset, BtreeLastVisibleEntry::postingIndex, psprintf(), RelationGetRelationName, and BtreeLastVisibleEntry::tid.

Referenced by bt_entry_unique_check().

bt_right_page_check_scankey()

static BTScanInsert bt_right_page_check_scankey ( BtreeCheckStatestate,
OffsetNumberrightfirstoffset 
)
static

Definition at line 1877 of file verify_nbtree.c.

/*
 * Build an insertion scankey from the first data item of the target
 * page's right sibling (skipping over deleted/half-dead siblings).
 *
 * Returns NULL when the target is the rightmost page on its level, or
 * when the live right sibling has no first data item.  When the sibling
 * is a leaf page, *rightfirstoffset receives the offset of the item the
 * key was built from.  Note that the returned scankey relies on the
 * right sibling page's memory remaining allocated.
 *
 * NOTE(review): this rendered listing elides a few original source lines
 * (e.g. orig. 1920 CHECK_FOR_INTERRUPTS, the ereport() heads at 1932 and
 * 2063, and the second argument line of PageGetItemIdCareful at 2054).
 */
1878{
1879 BTPageOpaque opaque;
1880 ItemId rightitem;
1881 IndexTuple firstitup;
1882 BlockNumber targetnext;
1883 Page rightpage;
1884 OffsetNumber nline;
1885
1886 /* Determine target's next block number */
1887 opaque = BTPageGetOpaque(state->target);
1888
1889 /* If target is already rightmost, no right sibling; nothing to do here */
1890 if (P_RIGHTMOST(opaque))
1891 return NULL;
1892
1893 /*
1894 * General notes on concurrent page splits and page deletion:
1895 *
1896 * Routines like _bt_search() don't require *any* page split interlock
1897 * when descending the tree, including something very light like a buffer
1898 * pin. That's why it's okay that we don't either. This avoidance of any
1899 * need to "couple" buffer locks is the raison d' etre of the Lehman & Yao
1900 * algorithm, in fact.
1901 *
1902 * That leaves deletion. A deleted page won't actually be recycled by
1903 * VACUUM early enough for us to fail to at least follow its right link
1904 * (or left link, or downlink) and find its sibling, because recycling
1905 * does not occur until no possible index scan could land on the page.
1906 * Index scans can follow links with nothing more than their snapshot as
1907 * an interlock and be sure of at least that much. (See page
1908 * recycling/"visible to everyone" notes in nbtree README.)
1909 *
1910 * Furthermore, it's okay if we follow a rightlink and find a half-dead or
1911 * dead (ignorable) page one or more times. There will either be a
1912 * further right link to follow that leads to a live page before too long
1913 * (before passing by parent's rightmost child), or we will find the end
1914 * of the entire level instead (possible when parent page is itself the
1915 * rightmost on its level).
1916 */
1917 targetnext = opaque->btpo_next;
1918 for (;;)
1919 {
1921
1922 rightpage = palloc_btree_page(state, targetnext);
1923 opaque = BTPageGetOpaque(rightpage);
1924
1925 if (!P_IGNORE(opaque) || P_RIGHTMOST(opaque))
1926 break;
1927
1928 /*
1929 * We landed on a deleted or half-dead sibling page. Step right until
1930 * we locate a live sibling page.
1931 */
1933 (errcode(ERRCODE_NO_DATA),
1934 errmsg_internal("level %u sibling page in block %u of index \"%s\" was found deleted or half dead",
1935 opaque->btpo_level, targetnext, RelationGetRelationName(state->rel)),
1936 errdetail_internal("Deleted page found when building scankey from right sibling.")));
1937
1938 targetnext = opaque->btpo_next;
1939
1940 /* Be slightly more pro-active in freeing this memory, just in case */
1941 pfree(rightpage);
1942 }
1943
1944 /*
1945 * No ShareLock held case -- why it's safe to proceed.
1946 *
1947 * Problem:
1948 *
1949 * We must avoid false positive reports of corruption when caller treats
1950 * item returned here as an upper bound on target's last item. In
1951 * general, false positives are disallowed. Avoiding them here when
1952 * caller is !readonly is subtle.
1953 *
1954 * A concurrent page deletion by VACUUM of the target page can result in
1955 * the insertion of items on to this right sibling page that would
1956 * previously have been inserted on our target page. There might have
1957 * been insertions that followed the target's downlink after it was made
1958 * to point to right sibling instead of target by page deletion's first
1959 * phase. The inserters insert items that would belong on target page.
1960 * This race is very tight, but it's possible. This is our only problem.
1961 *
1962 * Non-problems:
1963 *
1964 * We are not hindered by a concurrent page split of the target; we'll
1965 * never land on the second half of the page anyway. A concurrent split
1966 * of the right page will also not matter, because the first data item
1967 * remains the same within the left half, which we'll reliably land on. If
1968 * we had to skip over ignorable/deleted pages, it cannot matter because
1969 * their key space has already been atomically merged with the first
1970 * non-ignorable page we eventually find (doesn't matter whether the page
1971 * we eventually find is a true sibling or a cousin of target, which we go
1972 * into below).
1973 *
1974 * Solution:
1975 *
1976 * Caller knows that it should reverify that target is not ignorable
1977 * (half-dead or deleted) when cross-page sibling item comparison appears
1978 * to indicate corruption (invariant fails). This detects the single race
1979 * condition that exists for caller. This is correct because the
1980 * continued existence of target block as non-ignorable (not half-dead or
1981 * deleted) implies that target page was not merged into from the right by
1982 * deletion; the key space at or after target never moved left. Target's
1983 * parent either has the same downlink to target as before, or a <
1984 * downlink due to deletion at the left of target. Target either has the
1985 * same highkey as before, or a highkey < before when there is a page
1986 * split. (The rightmost concurrently-split-from-target-page page will
1987 * still have the same highkey as target was originally found to have,
1988 * which for our purposes is equivalent to target's highkey itself never
1989 * changing, since we reliably skip over
1990 * concurrently-split-from-target-page pages.)
1991 *
1992 * In simpler terms, we allow that the key space of the target may expand
1993 * left (the key space can move left on the left side of target only), but
1994 * the target key space cannot expand right and get ahead of us without
1995 * our detecting it. The key space of the target cannot shrink, unless it
1996 * shrinks to zero due to the deletion of the original page, our canary
1997 * condition. (To be very precise, we're a bit stricter than that because
1998 * it might just have been that the target page split and only the
1999 * original target page was deleted. We can be more strict, just not more
2000 * lax.)
2001 *
2002 * Top level tree walk caller moves on to next page (makes it the new
2003 * target) following recovery from this race. (cf. The rationale for
2004 * child/downlink verification needing a ShareLock within
2005 * bt_child_check(), where page deletion is also the main source of
2006 * trouble.)
2007 *
2008 * Note that it doesn't matter if right sibling page here is actually a
2009 * cousin page, because in order for the key space to be readjusted in a
2010 * way that causes us issues in next level up (guiding problematic
2011 * concurrent insertions to the cousin from the grandparent rather than to
2012 * the sibling from the parent), there'd have to be page deletion of
2013 * target's parent page (affecting target's parent's downlink in target's
2014 * grandparent page). Internal page deletion only occurs when there are
2015 * no child pages (they were all fully deleted), and caller is checking
2016 * that the target's parent has at least one non-deleted (so
2017 * non-ignorable) child: the target page. (Note that the first phase of
2018 * deletion atomically marks the page to be deleted half-dead/ignorable at
2019 * the same time downlink in its parent is removed, so caller will
2020 * definitely not fail to detect that this happened.)
2021 *
2022 * This trick is inspired by the method backward scans use for dealing
2023 * with concurrent page splits; concurrent page deletion is a problem that
2024 * similarly receives special consideration sometimes (it's possible that
2025 * the backwards scan will re-read its "original" block after failing to
2026 * find a right-link to it, having already moved in the opposite direction
2027 * (right/"forwards") a few times to try to locate one). Just like us,
2028 * that happens only to determine if there was a concurrent page deletion
2029 * of a reference page, and just like us if there was a page deletion of
2030 * that reference page it means we can move on from caring about the
2031 * reference page. See the nbtree README for a full description of how
2032 * that works.
2033 */
2034 nline = PageGetMaxOffsetNumber(rightpage);
2035
2036 /*
2037 * Get first data item, if any
2038 */
2039 if (P_ISLEAF(opaque) && nline >= P_FIRSTDATAKEY(opaque))
2040 {
2041 /* Return first data item (if any) */
2042 rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
2043 P_FIRSTDATAKEY(opaque));
2044 *rightfirstoffset = P_FIRSTDATAKEY(opaque);
2045 }
2046 else if (!P_ISLEAF(opaque) &&
2047 nline >= OffsetNumberNext(P_FIRSTDATAKEY(opaque)))
2048 {
2049 /*
2050 * Return first item after the internal page's "negative infinity"
2051 * item
2052 */
2053 rightitem = PageGetItemIdCareful(state, targetnext, rightpage,
2055 }
2056 else
2057 {
2058 /*
2059 * No first item. Page is probably empty leaf page, but it's also
2060 * possible that it's an internal page with only a negative infinity
2061 * item.
2062 */
2064 (errcode(ERRCODE_NO_DATA),
2065 errmsg_internal("%s block %u of index \"%s\" has no first data item",
2066 P_ISLEAF(opaque) ? "leaf" : "internal", targetnext,
2068 return NULL;
2069 }
2070
2071 /*
2072 * Return first real item scankey. Note that this relies on right page
2073 * memory remaining allocated.
2074 */
2075 firstitup = (IndexTuple) PageGetItem(rightpage, rightitem);
2076 return bt_mkscankey_pivotsearch(state->rel, firstitup);
2077}
#define DEBUG2
Definition: elog.h:29
static BTScanInsert bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup)

References bt_mkscankey_pivotsearch(), BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, CHECK_FOR_INTERRUPTS, DEBUG2, ereport, errcode(), errdetail_internal(), errmsg_internal(), OffsetNumberNext, P_FIRSTDATAKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), and RelationGetRelationName.

Referenced by bt_target_page_check().

bt_rootdescend()

static bool bt_rootdescend ( BtreeCheckStatestate,
IndexTuple  itup 
)
static

Definition at line 3021 of file verify_nbtree.c.

/*
 * Search from the root of the index for itup's key (including its heap
 * TID), returning whether an exactly-matching item was found on the leaf
 * page the descent lands on.  Only used in readonly + rootdescend
 * verification (asserted below).
 *
 * NOTE(review): the declaration of "key" (orig. line 3023, a BTScanInsert)
 * is elided in this rendered listing.
 */
3022{
3024 BTStack stack;
3025 Buffer lbuf;
3026 bool exists;
3027
3028 key = _bt_mkscankey(state->rel, itup);
/* rootdescend requires heapkeyspace semantics and a heap TID to search for */
3029 Assert(key->heapkeyspace && key->scantid != NULL);
3030
3031 /*
3032 * Search from root.
3033 *
3034 * Ideally, we would arrange to only move right within _bt_search() when
3035 * an interrupted page split is detected (i.e. when the incomplete split
3036 * bit is found to be set), but for now we accept the possibility that
3037 * that could conceal an inconsistency.
3038 */
3039 Assert(state->readonly && state->rootdescend);
3040 exists = false;
3041 stack = _bt_search(state->rel, NULL, key, &lbuf, BT_READ);
3042
3043 if (BufferIsValid(lbuf))
3044 {
3045 BTInsertStateData insertstate;
3046 OffsetNumber offnum;
3047 Page page;
3048
3049 insertstate.itup = itup;
3050 insertstate.itemsz = MAXALIGN(IndexTupleSize(itup));
3051 insertstate.itup_key = key;
3052 insertstate.postingoff = 0;
3053 insertstate.bounds_valid = false;
3054 insertstate.buf = lbuf;
3055
3056 /* Get matching tuple on leaf page */
3057 offnum = _bt_binsrch_insert(state->rel, &insertstate);
3058 /* Compare first >= matching item on leaf page, if any */
3059 page = BufferGetPage(lbuf);
3060 /* Should match on first heap TID when tuple has a posting list */
3061 if (offnum <= PageGetMaxOffsetNumber(page) &&
3062 insertstate.postingoff <= 0 &&
3063 _bt_compare(state->rel, key, page, offnum) == 0)
3064 exists = true;
3065 _bt_relbuf(state->rel, lbuf);
3066 }
3067
/* Release the descent stack and the scankey; cannot leak in long scans */
3068 _bt_freestack(stack);
3069 pfree(key);
3070
3071 return exists;
3072}
#define MAXALIGN(LEN)
Definition: c.h:810
void _bt_relbuf(Relation rel, Buffer buf)
Definition: nbtpage.c:1023
BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)
Definition: nbtsearch.c:107
OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
Definition: nbtsearch.c:479
int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum)
Definition: nbtsearch.c:693
void _bt_freestack(BTStack stack)
Definition: nbtutils.c:189
Size itemsz
Definition: nbtree.h:823
bool bounds_valid
Definition: nbtree.h:834
Buffer buf
Definition: nbtree.h:827
int postingoff
Definition: nbtree.h:843
IndexTuple itup
Definition: nbtree.h:822
BTScanInsert itup_key
Definition: nbtree.h:824

References _bt_binsrch_insert(), _bt_compare(), _bt_freestack(), _bt_mkscankey(), _bt_relbuf(), _bt_search(), Assert(), BTInsertStateData::bounds_valid, BT_READ, BTInsertStateData::buf, BufferGetPage(), BufferIsValid(), IndexTupleSize(), BTInsertStateData::itemsz, BTInsertStateData::itup, BTInsertStateData::itup_key, sort-test::key, MAXALIGN, PageGetMaxOffsetNumber(), pfree(), and BTInsertStateData::postingoff.

Referenced by bt_target_page_check().

bt_target_page_check()

static void bt_target_page_check ( BtreeCheckStatestate )
static

Definition at line 1251 of file verify_nbtree.c.

1252{
1253 OffsetNumber offset;
1254 OffsetNumber max;
1255 BTPageOpaque topaque;
1256
1257 /* Last visible entry info for checking indexes with unique constraint */
1259
1260 topaque = BTPageGetOpaque(state->target);
1261 max = PageGetMaxOffsetNumber(state->target);
1262
1263 elog(DEBUG2, "verifying %u items on %s block %u", max,
1264 P_ISLEAF(topaque) ? "leaf" : "internal", state->targetblock);
1265
1266 /*
1267 * Check the number of attributes in high key. Note, rightmost page
1268 * doesn't contain a high key, so nothing to check
1269 */
1270 if (!P_RIGHTMOST(topaque))
1271 {
1272 ItemId itemid;
1273 IndexTuple itup;
1274
1275 /* Verify line pointer before checking tuple */
1276 itemid = PageGetItemIdCareful(state, state->targetblock,
1277 state->target, P_HIKEY);
1278 if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
1279 P_HIKEY))
1280 {
1281 itup = (IndexTuple) PageGetItem(state->target, itemid);
1282 ereport(ERROR,
1283 (errcode(ERRCODE_INDEX_CORRUPTED),
1284 errmsg("wrong number of high key index tuple attributes in index \"%s\"",
1286 errdetail_internal("Index block=%u natts=%u block type=%s page lsn=%X/%08X.",
1287 state->targetblock,
1288 BTreeTupleGetNAtts(itup, state->rel),
1289 P_ISLEAF(topaque) ? "heap" : "index",
1290 LSN_FORMAT_ARGS(state->targetlsn))));
1291 }
1292 }
1293
1294 /*
1295 * Loop over page items, starting from first non-highkey item, not high
1296 * key (if any). Most tests are not performed for the "negative infinity"
1297 * real item (if any).
1298 */
1299 for (offset = P_FIRSTDATAKEY(topaque);
1300 offset <= max;
1301 offset = OffsetNumberNext(offset))
1302 {
1303 ItemId itemid;
1304 IndexTuple itup;
1305 size_t tupsize;
1306 BTScanInsert skey;
1307 bool lowersizelimit;
1308 ItemPointer scantid;
1309
1310 /*
1311 * True if we already called bt_entry_unique_check() for the current
1312 * item. This helps to avoid visiting the heap for keys, which are
1313 * anyway presented only once and can't comprise a unique violation.
1314 */
1315 bool unique_checked = false;
1316
1318
1319 itemid = PageGetItemIdCareful(state, state->targetblock,
1320 state->target, offset);
1321 itup = (IndexTuple) PageGetItem(state->target, itemid);
1322 tupsize = IndexTupleSize(itup);
1323
1324 /*
1325 * lp_len should match the IndexTuple reported length exactly, since
1326 * lp_len is completely redundant in indexes, and both sources of
1327 * tuple length are MAXALIGN()'d. nbtree does not use lp_len all that
1328 * frequently, and is surprisingly tolerant of corrupt lp_len fields.
1329 */
1330 if (tupsize != ItemIdGetLength(itemid))
1331 ereport(ERROR,
1332 (errcode(ERRCODE_INDEX_CORRUPTED),
1333 errmsg("index tuple size does not equal lp_len in index \"%s\"",
1335 errdetail_internal("Index tid=(%u,%u) tuple size=%zu lp_len=%u page lsn=%X/%08X.",
1336 state->targetblock, offset,
1337 tupsize, ItemIdGetLength(itemid),
1338 LSN_FORMAT_ARGS(state->targetlsn)),
1339 errhint("This could be a torn page problem.")));
1340
1341 /* Check the number of index tuple attributes */
1342 if (!_bt_check_natts(state->rel, state->heapkeyspace, state->target,
1343 offset))
1344 {
1345 ItemPointer tid;
1346 char *itid,
1347 *htid;
1348
1349 itid = psprintf("(%u,%u)", state->targetblock, offset);
1350 tid = BTreeTupleGetPointsToTID(itup);
1351 htid = psprintf("(%u,%u)",
1354
1355 ereport(ERROR,
1356 (errcode(ERRCODE_INDEX_CORRUPTED),
1357 errmsg("wrong number of index tuple attributes in index \"%s\"",
1359 errdetail_internal("Index tid=%s natts=%u points to %s tid=%s page lsn=%X/%08X.",
1360 itid,
1361 BTreeTupleGetNAtts(itup, state->rel),
1362 P_ISLEAF(topaque) ? "heap" : "index",
1363 htid,
1364 LSN_FORMAT_ARGS(state->targetlsn))));
1365 }
1366
1367 /*
1368 * Don't try to generate scankey using "negative infinity" item on
1369 * internal pages. They are always truncated to zero attributes.
1370 */
1371 if (offset_is_negative_infinity(topaque, offset))
1372 {
1373 /*
1374 * We don't call bt_child_check() for "negative infinity" items.
1375 * But if we're performing downlink connectivity check, we do it
1376 * for every item including "negative infinity" one.
1377 */
1378 if (!P_ISLEAF(topaque) && state->readonly)
1379 {
1381 offset,
1382 NULL,
1383 topaque->btpo_level);
1384 }
1385 continue;
1386 }
1387
1388 /*
1389 * Readonly callers may optionally verify that non-pivot tuples can
1390 * each be found by an independent search that starts from the root.
1391 * Note that we deliberately don't do individual searches for each
1392 * TID, since the posting list itself is validated by other checks.
1393 */
1394 if (state->rootdescend && P_ISLEAF(topaque) &&
1395 !bt_rootdescend(state, itup))
1396 {
1398 char *itid,
1399 *htid;
1400
1401 itid = psprintf("(%u,%u)", state->targetblock, offset);
1402 htid = psprintf("(%u,%u)", ItemPointerGetBlockNumber(tid),
1404
1405 ereport(ERROR,
1406 (errcode(ERRCODE_INDEX_CORRUPTED),
1407 errmsg("could not find tuple using search from root page in index \"%s\"",
1409 errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%08X.",
1410 itid, htid,
1411 LSN_FORMAT_ARGS(state->targetlsn))));
1412 }
1413
1414 /*
1415 * If tuple is a posting list tuple, make sure posting list TIDs are
1416 * in order
1417 */
1418 if (BTreeTupleIsPosting(itup))
1419 {
1420 ItemPointerData last;
1421 ItemPointer current;
1422
1424
1425 for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
1426 {
1427
1428 current = BTreeTupleGetPostingN(itup, i);
1429
1430 if (ItemPointerCompare(current, &last) <= 0)
1431 {
1432 char *itid = psprintf("(%u,%u)", state->targetblock, offset);
1433
1434 ereport(ERROR,
1435 (errcode(ERRCODE_INDEX_CORRUPTED),
1436 errmsg_internal("posting list contains misplaced TID in index \"%s\"",
1438 errdetail_internal("Index tid=%s posting list offset=%d page lsn=%X/%08X.",
1439 itid, i,
1440 LSN_FORMAT_ARGS(state->targetlsn))));
1441 }
1442
1443 ItemPointerCopy(current, &last);
1444 }
1445 }
1446
1447 /* Build insertion scankey for current page offset */
1448 skey = bt_mkscankey_pivotsearch(state->rel, itup);
1449
1450 /*
1451 * Make sure tuple size does not exceed the relevant BTREE_VERSION
1452 * specific limit.
1453 *
1454 * BTREE_VERSION 4 (which introduced heapkeyspace rules) requisitioned
1455 * a small amount of space from BTMaxItemSize() in order to ensure
1456 * that suffix truncation always has enough space to add an explicit
1457 * heap TID back to a tuple -- we pessimistically assume that every
1458 * newly inserted tuple will eventually need to have a heap TID
1459 * appended during a future leaf page split, when the tuple becomes
1460 * the basis of the new high key (pivot tuple) for the leaf page.
1461 *
1462 * Since the reclaimed space is reserved for that purpose, we must not
1463 * enforce the slightly lower limit when the extra space has been used
1464 * as intended. In other words, there is only a cross-version
1465 * difference in the limit on tuple size within leaf pages.
1466 *
1467 * Still, we're particular about the details within BTREE_VERSION 4
1468 * internal pages. Pivot tuples may only use the extra space for its
1469 * designated purpose. Enforce the lower limit for pivot tuples when
1470 * an explicit heap TID isn't actually present. (In all other cases
1471 * suffix truncation is guaranteed to generate a pivot tuple that's no
1472 * larger than the firstright tuple provided to it by its caller.)
1473 */
1474 lowersizelimit = skey->heapkeyspace &&
1475 (P_ISLEAF(topaque) || BTreeTupleGetHeapTID(itup) == NULL);
1476 if (tupsize > (lowersizelimit ? BTMaxItemSize : BTMaxItemSizeNoHeapTid))
1477 {
1479 char *itid,
1480 *htid;
1481
1482 itid = psprintf("(%u,%u)", state->targetblock, offset);
1483 htid = psprintf("(%u,%u)",
1486
1487 ereport(ERROR,
1488 (errcode(ERRCODE_INDEX_CORRUPTED),
1489 errmsg("index row size %zu exceeds maximum for index \"%s\"",
1490 tupsize, RelationGetRelationName(state->rel)),
1491 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.",
1492 itid,
1493 P_ISLEAF(topaque) ? "heap" : "index",
1494 htid,
1495 LSN_FORMAT_ARGS(state->targetlsn))));
1496 }
1497
1498 /* Fingerprint leaf page tuples (those that point to the heap) */
1499 if (state->heapallindexed && P_ISLEAF(topaque) && !ItemIdIsDead(itemid))
1500 {
1501 IndexTuple norm;
1502
1503 if (BTreeTupleIsPosting(itup))
1504 {
1505 /* Fingerprint all elements as distinct "plain" tuples */
1506 for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
1507 {
1508 IndexTuple logtuple;
1509
1510 logtuple = bt_posting_plain_tuple(itup, i);
1511 norm = bt_normalize_tuple(state, logtuple);
1512 bloom_add_element(state->filter, (unsigned char *) norm,
1513 IndexTupleSize(norm));
1514 /* Be tidy */
1515 if (norm != logtuple)
1516 pfree(norm);
1517 pfree(logtuple);
1518 }
1519 }
1520 else
1521 {
1522 norm = bt_normalize_tuple(state, itup);
1523 bloom_add_element(state->filter, (unsigned char *) norm,
1524 IndexTupleSize(norm));
1525 /* Be tidy */
1526 if (norm != itup)
1527 pfree(norm);
1528 }
1529 }
1530
1531 /*
1532 * * High key check *
1533 *
1534 * If there is a high key (if this is not the rightmost page on its
1535 * entire level), check that high key actually is upper bound on all
1536 * page items. If this is a posting list tuple, we'll need to set
1537 * scantid to be highest TID in posting list.
1538 *
1539 * We prefer to check all items against high key rather than checking
1540 * just the last and trusting that the operator class obeys the
1541 * transitive law (which implies that all previous items also
1542 * respected the high key invariant if they pass the item order
1543 * check).
1544 *
1545 * Ideally, we'd compare every item in the index against every other
1546 * item in the index, and not trust opclass obedience of the
1547 * transitive law to bridge the gap between children and their
1548 * grandparents (as well as great-grandparents, and so on). We don't
1549 * go to those lengths because that would be prohibitively expensive,
1550 * and probably not markedly more effective in practice.
1551 *
1552 * On the leaf level, we check that the key is <= the highkey.
1553 * However, on non-leaf levels we check that the key is < the highkey,
1554 * because the high key is "just another separator" rather than a copy
1555 * of some existing key item; we expect it to be unique among all keys
1556 * on the same level. (Suffix truncation will sometimes produce a
1557 * leaf highkey that is an untruncated copy of the lastleft item, but
1558 * never any other item, which necessitates weakening the leaf level
1559 * check to <=.)
1560 *
1561 * Full explanation for why a highkey is never truly a copy of another
1562 * item from the same level on internal levels:
1563 *
1564 * While the new left page's high key is copied from the first offset
1565 * on the right page during an internal page split, that's not the
1566 * full story. In effect, internal pages are split in the middle of
1567 * the firstright tuple, not between the would-be lastleft and
1568 * firstright tuples: the firstright key ends up on the left side as
1569 * left's new highkey, and the firstright downlink ends up on the
1570 * right side as right's new "negative infinity" item. The negative
1571 * infinity tuple is truncated to zero attributes, so we're only left
1572 * with the downlink. In other words, the copying is just an
1573 * implementation detail of splitting in the middle of a (pivot)
1574 * tuple. (See also: "Notes About Data Representation" in the nbtree
1575 * README.)
1576 */
1577 scantid = skey->scantid;
1578 if (state->heapkeyspace && BTreeTupleIsPosting(itup))
1579 skey->scantid = BTreeTupleGetMaxHeapTID(itup);
1580
1581 if (!P_RIGHTMOST(topaque) &&
1582 !(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
1584 {
1586 char *itid,
1587 *htid;
1588
1589 itid = psprintf("(%u,%u)", state->targetblock, offset);
1590 htid = psprintf("(%u,%u)",
1593
1594 ereport(ERROR,
1595 (errcode(ERRCODE_INDEX_CORRUPTED),
1596 errmsg("high key invariant violated for index \"%s\"",
1598 errdetail_internal("Index tid=%s points to %s tid=%s page lsn=%X/%08X.",
1599 itid,
1600 P_ISLEAF(topaque) ? "heap" : "index",
1601 htid,
1602 LSN_FORMAT_ARGS(state->targetlsn))));
1603 }
1604 /* Reset, in case scantid was set to (itup) posting tuple's max TID */
1605 skey->scantid = scantid;
1606
1607 /*
1608 * * Item order check *
1609 *
1610 * Check that items are stored on page in logical order, by checking
1611 * current item is strictly less than next item (if any).
1612 */
1613 if (OffsetNumberNext(offset) <= max &&
1614 !invariant_l_offset(state, skey, OffsetNumberNext(offset)))
1615 {
1616 ItemPointer tid;
1617 char *itid,
1618 *htid,
1619 *nitid,
1620 *nhtid;
1621
1622 itid = psprintf("(%u,%u)", state->targetblock, offset);
1623 tid = BTreeTupleGetPointsToTID(itup);
1624 htid = psprintf("(%u,%u)",
1627 nitid = psprintf("(%u,%u)", state->targetblock,
1628 OffsetNumberNext(offset));
1629
1630 /* Reuse itup to get pointed-to heap location of second item */
1631 itemid = PageGetItemIdCareful(state, state->targetblock,
1632 state->target,
1633 OffsetNumberNext(offset));
1634 itup = (IndexTuple) PageGetItem(state->target, itemid);
1635 tid = BTreeTupleGetPointsToTID(itup);
1636 nhtid = psprintf("(%u,%u)",
1639
1640 ereport(ERROR,
1641 (errcode(ERRCODE_INDEX_CORRUPTED),
1642 errmsg("item order invariant violated for index \"%s\"",
1644 errdetail_internal("Lower index tid=%s (points to %s tid=%s) higher index tid=%s (points to %s tid=%s) page lsn=%X/%08X.",
1645 itid,
1646 P_ISLEAF(topaque) ? "heap" : "index",
1647 htid,
1648 nitid,
1649 P_ISLEAF(topaque) ? "heap" : "index",
1650 nhtid,
1651 LSN_FORMAT_ARGS(state->targetlsn))));
1652 }
1653
1654 /*
1655 * If the index is unique verify entries uniqueness by checking the
1656 * heap tuples visibility. Immediately check posting tuples and
1657 * tuples with repeated keys. Postpone check for keys, which have the
1658 * first appearance.
1659 */
1660 if (state->checkunique && state->indexinfo->ii_Unique &&
1661 P_ISLEAF(topaque) && !skey->anynullkeys &&
1663 {
1664 bt_entry_unique_check(state, itup, state->targetblock, offset,
1665 &lVis);
1666 unique_checked = true;
1667 }
1668
1669 if (state->checkunique && state->indexinfo->ii_Unique &&
1670 P_ISLEAF(topaque) && OffsetNumberNext(offset) <= max)
1671 {
1672 /* Save current scankey tid */
1673 scantid = skey->scantid;
1674
1675 /*
1676 * Invalidate scankey tid to make _bt_compare compare only keys in
1677 * the item to report equality even if heap TIDs are different
1678 */
1679 skey->scantid = NULL;
1680
1681 /*
1682 * If next key tuple is different, invalidate last visible entry
1683 * data (whole index tuple or last posting in index tuple). Key
1684 * containing null value does not violate unique constraint and
1685 * treated as different to any other key.
1686 *
1687 * If the next key is the same as the previous one, do the
1688 * bt_entry_unique_check() call if it was postponed.
1689 */
1690 if (_bt_compare(state->rel, skey, state->target,
1691 OffsetNumberNext(offset)) != 0 || skey->anynullkeys)
1692 {
1695 lVis.postingIndex = -1;
1696 lVis.tid = NULL;
1697 }
1698 else if (!unique_checked)
1699 {
1700 bt_entry_unique_check(state, itup, state->targetblock, offset,
1701 &lVis);
1702 }
1703 skey->scantid = scantid; /* Restore saved scan key state */
1704 }
1705
1706 /*
1707 * * Last item check *
1708 *
1709 * Check last item against next/right page's first data item's when
1710 * last item on page is reached. This additional check will detect
1711 * transposed pages iff the supposed right sibling page happens to
1712 * belong before target in the key space. (Otherwise, a subsequent
1713 * heap verification will probably detect the problem.)
1714 *
1715 * This check is similar to the item order check that will have
1716 * already been performed for every other "real" item on target page
1717 * when last item is checked. The difference is that the next item
1718 * (the item that is compared to target's last item) needs to come
1719 * from the next/sibling page. There may not be such an item
1720 * available from sibling for various reasons, though (e.g., target is
1721 * the rightmost page on level).
1722 */
1723 if (offset == max)
1724 {
1725 BTScanInsert rightkey;
1726
1727 /* first offset on a right index page (log only) */
1728 OffsetNumber rightfirstoffset = InvalidOffsetNumber;
1729
1730 /* Get item in next/right page */
1731 rightkey = bt_right_page_check_scankey(state, &rightfirstoffset);
1732
1733 if (rightkey &&
1734 !invariant_g_offset(state, rightkey, max))
1735 {
1736 /*
1737 * As explained at length in bt_right_page_check_scankey(),
1738 * there is a known !readonly race that could account for
1739 * apparent violation of invariant, which we must check for
1740 * before actually proceeding with raising error. Our canary
1741 * condition is that target page was deleted.
1742 */
1743 if (!state->readonly)
1744 {
1745 /* Get fresh copy of target page */
1746 state->target = palloc_btree_page(state, state->targetblock);
1747 /* Note that we deliberately do not update target LSN */
1748 topaque = BTPageGetOpaque(state->target);
1749
1750 /*
1751 * All !readonly checks now performed; just return
1752 */
1753 if (P_IGNORE(topaque))
1754 return;
1755 }
1756
1757 ereport(ERROR,
1758 (errcode(ERRCODE_INDEX_CORRUPTED),
1759 errmsg("cross page item order invariant violated for index \"%s\"",
1761 errdetail_internal("Last item on page tid=(%u,%u) page lsn=%X/%08X.",
1762 state->targetblock, offset,
1763 LSN_FORMAT_ARGS(state->targetlsn))));
1764 }
1765
1766 /*
1767 * If index has unique constraint make sure that no more than one
1768 * found equal items is visible.
1769 */
1770 if (state->checkunique && state->indexinfo->ii_Unique &&
1771 rightkey && P_ISLEAF(topaque) && !P_RIGHTMOST(topaque))
1772 {
1773 BlockNumber rightblock_number = topaque->btpo_next;
1774
1775 elog(DEBUG2, "check cross page unique condition");
1776
1777 /*
1778 * Make _bt_compare compare only index keys without heap TIDs.
1779 * rightkey->scantid is modified destructively but it is ok
1780 * because it is not used later.
1781 */
1782 rightkey->scantid = NULL;
1783
1784 /* The first key on the next page is the same */
1785 if (_bt_compare(state->rel, rightkey, state->target, max) == 0 &&
1786 !rightkey->anynullkeys)
1787 {
1788 Page rightpage;
1789
1790 /*
1791 * Do the bt_entry_unique_check() call if it was
1792 * postponed.
1793 */
1794 if (!unique_checked)
1795 bt_entry_unique_check(state, itup, state->targetblock,
1796 offset, &lVis);
1797
1798 elog(DEBUG2, "cross page equal keys");
1799 rightpage = palloc_btree_page(state,
1800 rightblock_number);
1801 topaque = BTPageGetOpaque(rightpage);
1802
1803 if (P_IGNORE(topaque))
1804 {
1805 pfree(rightpage);
1806 break;
1807 }
1808
1809 if (unlikely(!P_ISLEAF(topaque)))
1810 ereport(ERROR,
1811 (errcode(ERRCODE_INDEX_CORRUPTED),
1812 errmsg("right block of leaf block is non-leaf for index \"%s\"",
1814 errdetail_internal("Block=%u page lsn=%X/%08X.",
1815 state->targetblock,
1816 LSN_FORMAT_ARGS(state->targetlsn))));
1817
1818 itemid = PageGetItemIdCareful(state, rightblock_number,
1819 rightpage,
1820 rightfirstoffset);
1821 itup = (IndexTuple) PageGetItem(rightpage, itemid);
1822
1823 bt_entry_unique_check(state, itup, rightblock_number, rightfirstoffset, &lVis);
1824
1825 pfree(rightpage);
1826 }
1827 }
1828 }
1829
1830 /*
1831 * * Downlink check *
1832 *
1833 * Additional check of child items iff this is an internal page and
1834 * caller holds a ShareLock. This happens for every downlink (item)
1835 * in target excluding the negative-infinity downlink (again, this is
1836 * because it has no useful value to compare).
1837 */
1838 if (!P_ISLEAF(topaque) && state->readonly)
1839 bt_child_check(state, skey, offset);
1840 }
1841
1842 /*
1843 * Special case bt_child_highkey_check() call
1844 *
1845 * We don't pass a real downlink, but we've to finish the level
1846 * processing. If condition is satisfied, we've already processed all the
1847 * downlinks from the target level. But there still might be pages to the
1848 * right of the child page pointed to by our rightmost downlink. And they
1849 * might have missing downlinks. This final call checks for them.
1850 */
1851 if (!P_ISLEAF(topaque) && P_RIGHTMOST(topaque) && state->readonly)
1852 {
1854 NULL, topaque->btpo_level);
1855 }
1856}
void bloom_add_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:135
#define unlikely(x)
Definition: c.h:402
#define ItemIdGetLength(itemId)
Definition: itemid.h:59
#define ItemIdIsDead(itemId)
Definition: itemid.h:113
int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
Definition: itemptr.c:51
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition: itemptr.h:172
static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)
Definition: nbtree.h:665
#define BTMaxItemSizeNoHeapTid
Definition: nbtree.h:170
#define BTMaxItemSize
Definition: nbtree.h:165
#define BTreeTupleGetNAtts(itup, rel)
Definition: nbtree.h:578
bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
Definition: nbtutils.c:4110
#define InvalidOffsetNumber
Definition: off.h:26
ItemPointer scantid
Definition: nbtree.h:802
bool heapkeyspace
Definition: nbtree.h:797
bool anynullkeys
Definition: nbtree.h:799
static bool invariant_l_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
static ItemPointer BTreeTupleGetPointsToTID(IndexTuple itup)
static IndexTuple bt_posting_plain_tuple(IndexTuple itup, int n)
static bool invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber upperbound)
static bool bt_rootdescend(BtreeCheckState *state, IndexTuple itup)
static BTScanInsert bt_right_page_check_scankey(BtreeCheckState *state, OffsetNumber *rightfirstoffset)
static bool invariant_g_offset(BtreeCheckState *state, BTScanInsert key, OffsetNumber lowerbound)
static IndexTuple bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
static void bt_child_check(BtreeCheckState *state, BTScanInsert targetkey, OffsetNumber downlinkoffnum)
static void bt_entry_unique_check(BtreeCheckState *state, IndexTuple itup, BlockNumber targetblock, OffsetNumber offset, BtreeLastVisibleEntry *lVis)
Definition: verify_nbtree.c:923

References _bt_check_natts(), _bt_compare(), BTScanInsertData::anynullkeys, BtreeLastVisibleEntry::blkno, bloom_add_element(), bt_child_check(), bt_child_highkey_check(), bt_entry_unique_check(), bt_mkscankey_pivotsearch(), bt_normalize_tuple(), bt_posting_plain_tuple(), bt_right_page_check_scankey(), bt_rootdescend(), BTMaxItemSize, BTMaxItemSizeNoHeapTid, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTPageOpaqueData::btpo_next, BTreeTupleGetHeapTID(), BTreeTupleGetMaxHeapTID(), BTreeTupleGetNAtts, BTreeTupleGetNPosting(), BTreeTupleGetPointsToTID(), BTreeTupleGetPostingN(), BTreeTupleIsPosting(), CHECK_FOR_INTERRUPTS, DEBUG2, elog, ereport, errcode(), errdetail_internal(), errhint(), errmsg(), errmsg_internal(), ERROR, BTScanInsertData::heapkeyspace, i, IndexTupleSize(), InvalidBlockNumber, InvalidOffsetNumber, invariant_g_offset(), invariant_l_offset(), invariant_leq_offset(), ItemIdGetLength, ItemIdIsDead, ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumber(), ItemPointerGetBlockNumberNoCheck(), ItemPointerGetOffsetNumber(), ItemPointerGetOffsetNumberNoCheck(), ItemPointerIsValid(), LSN_FORMAT_ARGS, BtreeLastVisibleEntry::offset, offset_is_negative_infinity(), OffsetNumberNext, P_FIRSTDATAKEY, P_HIKEY, P_IGNORE, P_ISLEAF, P_RIGHTMOST, PageGetItem(), PageGetItemIdCareful(), PageGetMaxOffsetNumber(), palloc_btree_page(), pfree(), BtreeLastVisibleEntry::postingIndex, psprintf(), RelationGetRelationName, BTScanInsertData::scantid, BtreeLastVisibleEntry::tid, and unlikely.

Referenced by bt_check_level_from_leftmost().

bt_tuple_present_callback()

static void bt_tuple_present_callback ( Relation  index,
ItemPointer  tid,
Datumvalues,
bool *  isnull,
bool  tupleIsAlive,
void *  checkstate 
)
static

Definition at line 2792 of file verify_nbtree.c.

2794{
2795 BtreeCheckState *state = (BtreeCheckState *) checkstate;
2796 IndexTuple itup,
2797 norm;
2798
2799 Assert(state->heapallindexed);
2800
2801 /* Generate a normalized index tuple for fingerprinting */
2803 itup->t_tid = *tid;
2804 norm = bt_normalize_tuple(state, itup);
2805
2806 /* Probe Bloom filter -- tuple should be present */
2807 if (bloom_lacks_element(state->filter, (unsigned char *) norm,
2808 IndexTupleSize(norm)))
2809 ereport(ERROR,
2811 errmsg("heap tuple (%u,%u) from table \"%s\" lacks matching index tuple within index \"%s\"",
2816 !state->readonly
2817 ? errhint("Retrying verification using the function bt_index_parent_check() might provide a more specific error.")
2818 : 0));
2819
2820 state->heaptuplespresent++;
2821 pfree(itup);
2822 /* Cannot leak memory here */
2823 if (norm != itup)
2824 pfree(norm);
2825}
bool bloom_lacks_element(bloom_filter *filter, unsigned char *elem, size_t len)
Definition: bloomfilter.c:157
static Datum values[MAXATTR]
Definition: bootstrap.c:153
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
Definition: type.h:96

References Assert(), bloom_lacks_element(), bt_normalize_tuple(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errhint(), errmsg(), ERROR, index_form_tuple(), IndexTupleSize(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), pfree(), RelationGetDescr, RelationGetRelationName, IndexTupleData::t_tid, and values.

Referenced by bt_check_every_level().

BTreeTupleGetHeapTIDCareful()

static ItemPointer BTreeTupleGetHeapTIDCareful ( BtreeCheckStatestate,
IndexTuple  itup,
bool  nonpivot 
)
inlinestatic

Definition at line 3544 of file verify_nbtree.c.

3546{
3547 ItemPointer htid;
3548
3549 /*
3550 * Caller determines whether this is supposed to be a pivot or non-pivot
3551 * tuple using page type and item offset number. Verify that tuple
3552 * metadata agrees with this.
3553 */
3554 Assert(state->heapkeyspace);
3555 if (BTreeTupleIsPivot(itup) && nonpivot)
3556 ereport(ERROR,
3557 (errcode(ERRCODE_INDEX_CORRUPTED),
3558 errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected pivot tuple",
3559 state->targetblock,
3561
3562 if (!BTreeTupleIsPivot(itup) && !nonpivot)
3563 ereport(ERROR,
3564 (errcode(ERRCODE_INDEX_CORRUPTED),
3565 errmsg_internal("block %u or its right sibling block or child block in index \"%s\" has unexpected non-pivot tuple",
3566 state->targetblock,
3568
3569 htid = BTreeTupleGetHeapTID(itup);
3570 if (!ItemPointerIsValid(htid) && nonpivot)
3571 ereport(ERROR,
3572 (errcode(ERRCODE_INDEX_CORRUPTED),
3573 errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
3574 state->targetblock,
3576
3577 return htid;
3578}

References Assert(), BTreeTupleGetHeapTID(), BTreeTupleIsPivot(), ereport, errcode(), errmsg(), errmsg_internal(), ERROR, ItemPointerIsValid(), and RelationGetRelationName.

Referenced by invariant_l_nontarget_offset(), and invariant_l_offset().

BTreeTupleGetPointsToTID()

/*
 * Return the TID that an index tuple "points to": the heap TID for a
 * non-pivot tuple, or the t_tid field carrying the downlink block for a
 * pivot tuple.
 *
 * Rely on the assumption that !heapkeyspace internal page data items will
 * correctly return TID with downlink here -- BTreeTupleGetHeapTID() won't
 * recognize it as a pivot tuple, but everything still works out because
 * the t_tid field is still returned.
 */
static inline ItemPointer
BTreeTupleGetPointsToTID(IndexTuple itup)
{
	/* Pivot tuple: t_tid carries downlink block (heapkeyspace variant) */
	if (BTreeTupleIsPivot(itup))
		return &itup->t_tid;

	/* Non-pivot tuple: hand back its heap TID */
	return BTreeTupleGetHeapTID(itup);
}

References BTreeTupleGetHeapTID(), BTreeTupleIsPivot(), and IndexTupleData::t_tid.

Referenced by bt_target_page_check().

heap_entry_is_visible()

static bool heap_entry_is_visible ( BtreeCheckStatestate,
ItemPointer  tid 
)
static

Definition at line 864 of file verify_nbtree.c.

865{
866 bool tid_visible;
867
868 TupleTableSlot *slot = table_slot_create(state->heaprel, NULL);
869
870 tid_visible = table_tuple_fetch_row_version(state->heaprel,
871 tid, state->snapshot, slot);
872 if (slot != NULL)
874
875 return tid_visible;
876}
void ExecDropSingleTupleTableSlot(TupleTableSlot *slot)
Definition: execTuples.c:1443
TupleTableSlot * table_slot_create(Relation relation, List **reglist)
Definition: tableam.c:92
static bool table_tuple_fetch_row_version(Relation rel, ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot)
Definition: tableam.h:1253

References ExecDropSingleTupleTableSlot(), table_slot_create(), and table_tuple_fetch_row_version().

Referenced by bt_entry_unique_check().

invariant_g_offset()

static bool invariant_g_offset ( BtreeCheckStatestate,
BTScanInsert  key,
OffsetNumber  lowerbound 
)
inlinestatic

Definition at line 3206 of file verify_nbtree.c.

3208{
3209 int32 cmp;
3210
3211 Assert(!key->nextkey && key->backward);
3212
3213 cmp = _bt_compare(state->rel, key, state->target, lowerbound);
3214
3215 /* pg_upgrade'd indexes may legally have equal sibling tuples */
3216 if (!key->heapkeyspace)
3217 return cmp >= 0;
3218
3219 /*
3220 * No need to consider the possibility that scankey has attributes that we
3221 * need to force to be interpreted as negative infinity. _bt_compare() is
3222 * able to determine that scankey is greater than negative infinity. The
3223 * distinction between "==" and "<" isn't interesting here, since
3224 * corruption is indicated either way.
3225 */
3226 return cmp > 0;
3227}
int32_t int32
Definition: c.h:534
static int cmp(const chr *x, const chr *y, size_t len)
Definition: regc_locale.c:743

References _bt_compare(), Assert(), cmp(), and sort-test::key.

Referenced by bt_target_page_check().

invariant_l_nontarget_offset()

static bool invariant_l_nontarget_offset ( BtreeCheckStatestate,
BTScanInsert  key,
BlockNumber  nontargetblock,
Page  nontarget,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3242 of file verify_nbtree.c.

3245{
3246 ItemId itemid;
3247 int32 cmp;
3248
3249 Assert(!key->nextkey && key->backward);
3250
3251 /* Verify line pointer before checking tuple */
3252 itemid = PageGetItemIdCareful(state, nontargetblock, nontarget,
3253 upperbound);
3254 cmp = _bt_compare(state->rel, key, nontarget, upperbound);
3255
3256 /* pg_upgrade'd indexes may legally have equal sibling tuples */
3257 if (!key->heapkeyspace)
3258 return cmp <= 0;
3259
3260 /* See invariant_l_offset() for an explanation of this extra step */
3261 if (cmp == 0)
3262 {
3263 IndexTuple child;
3264 int uppnkeyatts;
3265 ItemPointer childheaptid;
3266 BTPageOpaque copaque;
3267 bool nonpivot;
3268
3269 child = (IndexTuple) PageGetItem(nontarget, itemid);
3270 copaque = BTPageGetOpaque(nontarget);
3271 nonpivot = P_ISLEAF(copaque) && upperbound >= P_FIRSTDATAKEY(copaque);
3272
3273 /* Get number of keys + heap TID for child/non-target item */
3274 uppnkeyatts = BTreeTupleGetNKeyAtts(child, state->rel);
3275 childheaptid = BTreeTupleGetHeapTIDCareful(state, child, nonpivot);
3276
3277 /* Heap TID is tiebreaker key attribute */
3278 if (key->keysz == uppnkeyatts)
3279 return key->scantid == NULL && childheaptid != NULL;
3280
3281 return key->keysz < uppnkeyatts;
3282 }
3283
3284 return cmp < 0;
3285}
static ItemPointer BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup, bool nonpivot)
#define BTreeTupleGetNKeyAtts(itup, rel)
Definition: verify_nbtree.c:56

References _bt_compare(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTIDCareful(), BTreeTupleGetNKeyAtts, cmp(), sort-test::key, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), and PageGetItemIdCareful().

Referenced by bt_child_check().

invariant_l_offset()

static bool invariant_l_offset ( BtreeCheckStatestate,
BTScanInsert  key,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3120 of file verify_nbtree.c.

3122{
3123 ItemId itemid;
3124 int32 cmp;
3125
3126 Assert(!key->nextkey && key->backward);
3127
3128 /* Verify line pointer before checking tuple */
3129 itemid = PageGetItemIdCareful(state, state->targetblock, state->target,
3130 upperbound);
3131 /* pg_upgrade'd indexes may legally have equal sibling tuples */
3132 if (!key->heapkeyspace)
3133 return invariant_leq_offset(state, key, upperbound);
3134
3135 cmp = _bt_compare(state->rel, key, state->target, upperbound);
3136
3137 /*
3138 * _bt_compare() is capable of determining that a scankey with a
3139 * filled-out attribute is greater than pivot tuples where the comparison
3140 * is resolved at a truncated attribute (value of attribute in pivot is
3141 * minus infinity). However, it is not capable of determining that a
3142 * scankey is _less than_ a tuple on the basis of a comparison resolved at
3143 * _scankey_ minus infinity attribute. Complete an extra step to simulate
3144 * having minus infinity values for omitted scankey attribute(s).
3145 */
3146 if (cmp == 0)
3147 {
3148 BTPageOpaque topaque;
3149 IndexTuple ritup;
3150 int uppnkeyatts;
3151 ItemPointer rheaptid;
3152 bool nonpivot;
3153
3154 ritup = (IndexTuple) PageGetItem(state->target, itemid);
3155 topaque = BTPageGetOpaque(state->target);
3156 nonpivot = P_ISLEAF(topaque) && upperbound >= P_FIRSTDATAKEY(topaque);
3157
3158 /* Get number of keys + heap TID for item to the right */
3159 uppnkeyatts = BTreeTupleGetNKeyAtts(ritup, state->rel);
3160 rheaptid = BTreeTupleGetHeapTIDCareful(state, ritup, nonpivot);
3161
3162 /* Heap TID is tiebreaker key attribute */
3163 if (key->keysz == uppnkeyatts)
3164 return key->scantid == NULL && rheaptid != NULL;
3165
3166 return key->keysz < uppnkeyatts;
3167 }
3168
3169 return cmp < 0;
3170}

References _bt_compare(), Assert(), BTPageGetOpaque, BTreeTupleGetHeapTIDCareful(), BTreeTupleGetNKeyAtts, cmp(), invariant_leq_offset(), sort-test::key, P_FIRSTDATAKEY, P_ISLEAF, PageGetItem(), and PageGetItemIdCareful().

Referenced by bt_target_page_check().

invariant_leq_offset()

static bool invariant_leq_offset ( BtreeCheckStatestate,
BTScanInsert  key,
OffsetNumber  upperbound 
)
inlinestatic

Definition at line 3183 of file verify_nbtree.c.

3185{
3186 int32 cmp;
3187
3188 Assert(!key->nextkey && key->backward);
3189
3190 cmp = _bt_compare(state->rel, key, state->target, upperbound);
3191
3192 return cmp <= 0;
3193}

References _bt_compare(), Assert(), cmp(), and sort-test::key.

Referenced by bt_target_page_check(), and invariant_l_offset().

offset_is_negative_infinity()

/*
 * Is the item at the given offset the "negative infinity" item of its page?
 *
 * For internal pages only, the first item after the high key (if any) is
 * the negative infinity item.  Internal pages always have one, whereas
 * leaf pages never do; the negative infinity item is therefore either the
 * first or second line item of an internal page, and absent otherwise.
 *
 * Negative infinity items are a special case among pivot tuples: they
 * always have zero attributes, while every other pivot tuple has nkeyatts
 * attributes.
 *
 * Right-most pages lack a high key, but can be thought of as conceptually
 * having a "positive infinity" high key.  There is thus a symmetry between
 * downlink items in parent pages and high keys in children: together they
 * describe the part of the key space that belongs to each page in the
 * index.  For example, every child of the root page has negative infinity
 * as a lower bound (from the root's negative infinity downlink) and
 * positive infinity as an upper bound (implicitly, from the "imaginary"
 * positive infinity high key of the root).
 */
static inline bool
offset_is_negative_infinity(BTPageOpaque opaque, OffsetNumber offset)
{
	/* Leaf pages never hold a negative infinity item */
	if (P_ISLEAF(opaque))
		return false;

	return offset == P_FIRSTDATAKEY(opaque);
}

References P_FIRSTDATAKEY, and P_ISLEAF.

Referenced by bt_child_check(), bt_child_highkey_check(), and bt_target_page_check().

PageGetItemIdCareful()

static ItemId PageGetItemIdCareful ( BtreeCheckStatestate,
BlockNumber  block,
Page  page,
OffsetNumber  offset 
)
static

Definition at line 3504 of file verify_nbtree.c.

3506{
3507 ItemId itemid = PageGetItemId(page, offset);
3508
3509 if (ItemIdGetOffset(itemid) + ItemIdGetLength(itemid) >
3510 BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData)))
3511 ereport(ERROR,
3512 (errcode(ERRCODE_INDEX_CORRUPTED),
3513 errmsg("line pointer points past end of tuple space in index \"%s\"",
3515 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
3516 block, offset, ItemIdGetOffset(itemid),
3517 ItemIdGetLength(itemid),
3518 ItemIdGetFlags(itemid))));
3519
3520 /*
3521 * Verify that line pointer isn't LP_REDIRECT or LP_UNUSED, since nbtree
3522 * never uses either. Verify that line pointer has storage, too, since
3523 * even LP_DEAD items should have storage within nbtree.
3524 */
3525 if (ItemIdIsRedirected(itemid) || !ItemIdIsUsed(itemid) ||
3526 ItemIdGetLength(itemid) == 0)
3527 ereport(ERROR,
3528 (errcode(ERRCODE_INDEX_CORRUPTED),
3529 errmsg("invalid line pointer storage in index \"%s\"",
3531 errdetail_internal("Index tid=(%u,%u) lp_off=%u, lp_len=%u lp_flags=%u.",
3532 block, offset, ItemIdGetOffset(itemid),
3533 ItemIdGetLength(itemid),
3534 ItemIdGetFlags(itemid))));
3535
3536 return itemid;
3537}
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition: bufpage.h:244
#define ItemIdGetOffset(itemId)
Definition: itemid.h:65
#define ItemIdIsUsed(itemId)
Definition: itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition: itemid.h:106
#define ItemIdGetFlags(itemId)
Definition: itemid.h:71

References ereport, errcode(), errdetail_internal(), errmsg(), ERROR, ItemIdGetFlags, ItemIdGetLength, ItemIdGetOffset, ItemIdIsRedirected, ItemIdIsUsed, MAXALIGN, PageGetItemId(), and RelationGetRelationName.

Referenced by bt_check_level_from_leftmost(), bt_child_check(), bt_child_highkey_check(), bt_downlink_missing_check(), bt_right_page_check_scankey(), bt_target_page_check(), invariant_l_nontarget_offset(), and invariant_l_offset().

palloc_btree_page()

static Page palloc_btree_page ( BtreeCheckStatestate,
BlockNumber  blocknum 
)
static

Definition at line 3302 of file verify_nbtree.c.

3303{
3304 Buffer buffer;
3305 Page page;
3306 BTPageOpaque opaque;
3307 OffsetNumber maxoffset;
3308
3309 page = palloc(BLCKSZ);
3310
3311 /*
3312 * We copy the page into local storage to avoid holding pin on the buffer
3313 * longer than we must.
3314 */
3315 buffer = ReadBufferExtended(state->rel, MAIN_FORKNUM, blocknum, RBM_NORMAL,
3316 state->checkstrategy);
3317 LockBuffer(buffer, BT_READ);
3318
3319 /*
3320 * Perform the same basic sanity checking that nbtree itself performs for
3321 * every page:
3322 */
3323 _bt_checkpage(state->rel, buffer);
3324
3325 /* Only use copy of page in palloc()'d memory */
3326 memcpy(page, BufferGetPage(buffer), BLCKSZ);
3327 UnlockReleaseBuffer(buffer);
3328
3329 opaque = BTPageGetOpaque(page);
3330
3331 if (P_ISMETA(opaque) && blocknum != BTREE_METAPAGE)
3332 ereport(ERROR,
3333 (errcode(ERRCODE_INDEX_CORRUPTED),
3334 errmsg("invalid meta page found at block %u in index \"%s\"",
3335 blocknum, RelationGetRelationName(state->rel))));
3336
3337 /* Check page from block that ought to be meta page */
3338 if (blocknum == BTREE_METAPAGE)
3339 {
3340 BTMetaPageData *metad = BTPageGetMeta(page);
3341
3342 if (!P_ISMETA(opaque) ||
3343 metad->btm_magic != BTREE_MAGIC)
3344 ereport(ERROR,
3345 (errcode(ERRCODE_INDEX_CORRUPTED),
3346 errmsg("index \"%s\" meta page is corrupt",
3348
3349 if (metad->btm_version < BTREE_MIN_VERSION ||
3350 metad->btm_version > BTREE_VERSION)
3351 ereport(ERROR,
3352 (errcode(ERRCODE_INDEX_CORRUPTED),
3353 errmsg("version mismatch in index \"%s\": file version %d, "
3354 "current version %d, minimum supported version %d",
3356 metad->btm_version, BTREE_VERSION,
3358
3359 /* Finished with metapage checks */
3360 return page;
3361 }
3362
3363 /*
3364 * Deleted pages that still use the old 32-bit XID representation have no
3365 * sane "level" field because they type pun the field, but all other pages
3366 * (including pages deleted on Postgres 14+) have a valid value.
3367 */
3368 if (!P_ISDELETED(opaque) || P_HAS_FULLXID(opaque))
3369 {
3370 /* Okay, no reason not to trust btpo_level field from page */
3371
3372 if (P_ISLEAF(opaque) && opaque->btpo_level != 0)
3373 ereport(ERROR,
3374 (errcode(ERRCODE_INDEX_CORRUPTED),
3375 errmsg_internal("invalid leaf page level %u for block %u in index \"%s\"",
3376 opaque->btpo_level, blocknum,
3378
3379 if (!P_ISLEAF(opaque) && opaque->btpo_level == 0)
3380 ereport(ERROR,
3381 (errcode(ERRCODE_INDEX_CORRUPTED),
3382 errmsg_internal("invalid internal page level 0 for block %u in index \"%s\"",
3383 blocknum,
3385 }
3386
3387 /*
3388 * Sanity checks for number of items on page.
3389 *
3390 * As noted at the beginning of _bt_binsrch(), an internal page must have
3391 * children, since there must always be a negative infinity downlink
3392 * (there may also be a highkey). In the case of non-rightmost leaf
3393 * pages, there must be at least a highkey. The exceptions are deleted
3394 * pages, which contain no items.
3395 *
3396 * This is correct when pages are half-dead, since internal pages are
3397 * never half-dead, and leaf pages must have a high key when half-dead
3398 * (the rightmost page can never be deleted). It's also correct with
3399 * fully deleted pages: _bt_unlink_halfdead_page() doesn't change anything
3400 * about the target page other than setting the page as fully dead, and
3401 * setting its xact field. In particular, it doesn't change the sibling
3402 * links in the deletion target itself, since they're required when index
3403 * scans land on the deletion target, and then need to move right (or need
3404 * to move left, in the case of backward index scans).
3405 */
3406 maxoffset = PageGetMaxOffsetNumber(page);
3407 if (maxoffset > MaxIndexTuplesPerPage)
3408 ereport(ERROR,
3409 (errcode(ERRCODE_INDEX_CORRUPTED),
3410 errmsg("Number of items on block %u of index \"%s\" exceeds MaxIndexTuplesPerPage (%u)",
3411 blocknum, RelationGetRelationName(state->rel),
3413
3414 if (!P_ISLEAF(opaque) && !P_ISDELETED(opaque) && maxoffset < P_FIRSTDATAKEY(opaque))
3415 ereport(ERROR,
3416 (errcode(ERRCODE_INDEX_CORRUPTED),
3417 errmsg("internal block %u in index \"%s\" lacks high key and/or at least one downlink",
3418 blocknum, RelationGetRelationName(state->rel))));
3419
3420 if (P_ISLEAF(opaque) && !P_ISDELETED(opaque) && !P_RIGHTMOST(opaque) && maxoffset < P_HIKEY)
3421 ereport(ERROR,
3422 (errcode(ERRCODE_INDEX_CORRUPTED),
3423 errmsg("non-rightmost leaf block %u in index \"%s\" lacks high key item",
3424 blocknum, RelationGetRelationName(state->rel))));
3425
3426 /*
3427 * In general, internal pages are never marked half-dead, except on
3428 * versions of Postgres prior to 9.4, where it can be valid transient
3429 * state. This state is nonetheless treated as corruption by VACUUM on
3430 * from version 9.4 on, so do the same here. See _bt_pagedel() for full
3431 * details.
3432 */
3433 if (!P_ISLEAF(opaque) && P_ISHALFDEAD(opaque))
3434 ereport(ERROR,
3435 (errcode(ERRCODE_INDEX_CORRUPTED),
3436 errmsg("internal page block %u in index \"%s\" is half-dead",
3437 blocknum, RelationGetRelationName(state->rel)),
3438 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));
3439
3440 /*
3441 * Check that internal pages have no garbage items, and that no page has
3442 * an invalid combination of deletion-related page level flags
3443 */
3444 if (!P_ISLEAF(opaque) && P_HAS_GARBAGE(opaque))
3445 ereport(ERROR,
3446 (errcode(ERRCODE_INDEX_CORRUPTED),
3447 errmsg_internal("internal page block %u in index \"%s\" has garbage items",
3448 blocknum, RelationGetRelationName(state->rel))));
3449
3450 if (P_HAS_FULLXID(opaque) && !P_ISDELETED(opaque))
3451 ereport(ERROR,
3452 (errcode(ERRCODE_INDEX_CORRUPTED),
3453 errmsg_internal("full transaction id page flag appears in non-deleted block %u in index \"%s\"",
3454 blocknum, RelationGetRelationName(state->rel))));
3455
3456 if (P_ISDELETED(opaque) && P_ISHALFDEAD(opaque))
3457 ereport(ERROR,
3458 (errcode(ERRCODE_INDEX_CORRUPTED),
3459 errmsg_internal("deleted page block %u in index \"%s\" is half-dead",
3460 blocknum, RelationGetRelationName(state->rel))));
3461
3462 return page;
3463}
#define MaxIndexTuplesPerPage
Definition: itup.h:181
#define BTREE_MIN_VERSION
Definition: nbtree.h:152
#define P_HAS_GARBAGE(opaque)
Definition: nbtree.h:227
#define P_ISMETA(opaque)
Definition: nbtree.h:224
#define BTREE_MAGIC
Definition: nbtree.h:150
#define BTREE_VERSION
Definition: nbtree.h:151
uint32 btm_version
Definition: nbtree.h:107
uint32 btm_magic
Definition: nbtree.h:106

References _bt_checkpage(), BT_READ, BTMetaPageData::btm_magic, BTMetaPageData::btm_version, BTPageGetMeta, BTPageGetOpaque, BTPageOpaqueData::btpo_level, BTREE_MAGIC, BTREE_METAPAGE, BTREE_MIN_VERSION, BTREE_VERSION, BufferGetPage(), ereport, errcode(), errhint(), errmsg(), errmsg_internal(), ERROR, LockBuffer(), MAIN_FORKNUM, MaxIndexTuplesPerPage, P_FIRSTDATAKEY, P_HAS_FULLXID, P_HAS_GARBAGE, P_HIKEY, P_ISDELETED, P_ISHALFDEAD, P_ISLEAF, P_ISMETA, P_RIGHTMOST, PageGetMaxOffsetNumber(), palloc(), RBM_NORMAL, ReadBufferExtended(), RelationGetRelationName, and UnlockReleaseBuffer().

Referenced by bt_check_every_level(), bt_check_level_from_leftmost(), bt_child_check(), bt_child_highkey_check(), bt_downlink_missing_check(), bt_leftmost_ignoring_half_dead(), bt_right_page_check_scankey(), and bt_target_page_check().

PG_FUNCTION_INFO_V1() [1/2]

PG_FUNCTION_INFO_V1 ( bt_index_check  )

PG_FUNCTION_INFO_V1() [2/2]

PG_FUNCTION_INFO_V1 ( bt_index_parent_check  )

PG_MODULE_MAGIC_EXT()

/*
 * Module magic block identifying this shared library to the PostgreSQL
 * backend at load time; the extended form also records the extension's
 * name and version.  The Doxygen rendering had mangled the designated
 * initializers (". name" with a stray space, ".version" missing its dot).
 */
PG_MODULE_MAGIC_EXT(.name = "amcheck",
					.version = PG_VERSION);

Page converted by AltStyle (-> original) /