[フレーム]

nbtpage.c

1/*-------------------------------------------------------------------------

2 *

3 * nbtpage.c

4 * BTree-specific page management code for the Postgres btree access

5 * method.

6 *

9 *

10 *

11 * IDENTIFICATION

12 * src/backend/access/nbtree/nbtpage.c

13 *

14 * NOTES

15 * Postgres btree pages look like ordinary relation pages. The opaque

16 * data at high addresses includes pointers to left and right siblings

17 * and flag data describing page state. The first page in a btree, page

18 * zero, is special -- it stores meta-information describing the tree.

19 * Pages one and higher store the actual tree data.

20 *

21 *-------------------------------------------------------------------------

22 */

23#include "postgres.h"

24

25#include "access/nbtree.h"

26#include "access/nbtxlog.h"

27#include "access/tableam.h"

28#include "access/transam.h"

29#include "access/xlog.h"

30#include "access/xloginsert.h"

31#include "common/int.h"

32#include "miscadmin.h"

33#include "storage/indexfsm.h"

34#include "storage/predicate.h"

35#include "storage/procarray.h"

36#include "utils/memdebug.h"

37#include "utils/memutils.h"

38#include "utils/snapmgr.h"

39

40static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);

41static void _bt_delitems_delete(Relation rel, Buffer buf,

42 TransactionId snapshotConflictHorizon,

43 bool isCatalogRel,

44 OffsetNumber *deletable, int ndeletable,

45 BTVacuumPosting *updatable, int nupdatable);

46static char *_bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,

47 OffsetNumber *updatedoffsets,

48 Size *updatedbuflen, bool needswal);

49static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel,

50 Buffer leafbuf, BTStack stack);

51static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf,

52 BlockNumber scanblkno,

53 bool *rightsib_empty,

54 BTVacState *vstate);

55static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel,

56 BlockNumber child, BTStack stack,

57 Buffer *subtreeparent, OffsetNumber *poffset,

58 BlockNumber *topparent,

59 BlockNumber *topparentrightsib);

60static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target,

61 FullTransactionId safexid);

62

63/*

64 * _bt_initmetapage() -- Fill a page buffer with a correct metapage image

65 */

66void

67 _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level,

68 bool allequalimage)

69{

70 BTMetaPageData *metad;

71 BTPageOpaque metaopaque;

72

73 _bt_pageinit(page, BLCKSZ);

74

75 metad = BTPageGetMeta(page);

76 metad->btm_magic = BTREE_MAGIC;

77 metad->btm_version = BTREE_VERSION;

78 metad->btm_root = rootbknum;

79 metad->btm_level = level;

80 metad->btm_fastroot = rootbknum;

81 metad->btm_fastlevel = level;

82 metad->btm_last_cleanup_num_delpages = 0;

83 metad->btm_last_cleanup_num_heap_tuples = -1.0;

84 metad->btm_allequalimage = allequalimage;

85

86 metaopaque = BTPageGetOpaque(page);

87 metaopaque->btpo_flags = BTP_META;

88

89 /*

90 * Set pd_lower just past the end of the metadata. This is essential,

91 * because without doing so, metadata will be lost if xlog.c compresses

92 * the page.

93 */

94 ((PageHeader) page)->pd_lower =

95 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;

96}

97

98/*

99 * _bt_upgrademetapage() -- Upgrade a meta-page from an old format to version

100 * 3, the last version that can be updated without broadly affecting

101 * on-disk compatibility. (A REINDEX is required to upgrade to v4.)

102 *

103 * This routine does purely in-memory image upgrade. Caller is

104 * responsible for locking, WAL-logging etc.

105 */

106void

107 _bt_upgrademetapage(Page page)

108{

109 BTMetaPageData *metad;

110 BTPageOpaque metaopaque PG_USED_FOR_ASSERTS_ONLY;

111

112 metad = BTPageGetMeta(page);

113 metaopaque = BTPageGetOpaque(page);

114

115 /* It must be really a meta page of upgradable version */

116 Assert(metaopaque->btpo_flags & BTP_META);

117 Assert(metad->btm_version < BTREE_NOVAC_VERSION);

118 Assert(metad->btm_version >= BTREE_MIN_VERSION);

119

120 /* Set version number and fill extra fields added into version 3 */

121 metad->btm_version = BTREE_NOVAC_VERSION;

122 metad->btm_last_cleanup_num_delpages = 0;

123 metad->btm_last_cleanup_num_heap_tuples = -1.0;

124 /* Only a REINDEX can set this field */

125 Assert(!metad->btm_allequalimage);

126 metad->btm_allequalimage = false;

127

128 /* Adjust pd_lower (see _bt_initmetapage() for details) */

129 ((PageHeader) page)->pd_lower =

130 ((char *) metad + sizeof(BTMetaPageData)) - (char *) page;

131}

132

133/*

134 * Get metadata from share-locked buffer containing metapage, while performing

135 * standard sanity checks.

136 *

137 * Callers that cache data returned here in local cache should note that an

138 * on-the-fly upgrade using _bt_upgrademetapage() can change the version field

139 * and BTREE_NOVAC_VERSION specific fields without invalidating local cache.

140 */

141static BTMetaPageData *

142 _bt_getmeta(Relation rel, Buffer metabuf)

143{

144 Page metapg;

145 BTPageOpaque metaopaque;

146 BTMetaPageData *metad;

147

148 metapg = BufferGetPage(metabuf);

149 metaopaque = BTPageGetOpaque(metapg);

150 metad = BTPageGetMeta(metapg);

151

152 /* sanity-check the metapage */

153 if (!P_ISMETA(metaopaque) ||

154 metad->btm_magic != BTREE_MAGIC)

155 ereport(ERROR,

156 (errcode(ERRCODE_INDEX_CORRUPTED),

157 errmsg("index \"%s\" is not a btree",

158 RelationGetRelationName(rel))));

159

160 if (metad->btm_version < BTREE_MIN_VERSION ||

161 metad->btm_version > BTREE_VERSION)

162 ereport(ERROR,

163 (errcode(ERRCODE_INDEX_CORRUPTED),

164 errmsg("version mismatch in index \"%s\": file version %d, "

165 "current version %d, minimal supported version %d",

166 RelationGetRelationName(rel),

167 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

168

169 return metad;

170}

171

172/*

173 * _bt_vacuum_needs_cleanup() -- Checks if index needs cleanup

174 *

175 * Called by btvacuumcleanup when btbulkdelete was never called because no

176 * index tuples needed to be deleted.

177 */

178bool

179 _bt_vacuum_needs_cleanup(Relation rel)

180{

181 Buffer metabuf;

182 Page metapg;

183 BTMetaPageData *metad;

184 uint32 btm_version;

185 BlockNumber prev_num_delpages;

186

187 /*

188 * Copy details from metapage to local variables quickly.

189 *

190 * Note that we deliberately avoid using cached version of metapage here.

191 */

192 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

193 metapg = BufferGetPage(metabuf);

194 metad = BTPageGetMeta(metapg);

195 btm_version = metad->btm_version;

196

197 if (btm_version < BTREE_NOVAC_VERSION)

198 {

199 /*

200 * Metapage needs to be dynamically upgraded to store fields that are

201 * only present when btm_version >= BTREE_NOVAC_VERSION

202 */

203 _bt_relbuf(rel, metabuf);

204 return true;

205 }

206

207 prev_num_delpages = metad->btm_last_cleanup_num_delpages;

208 _bt_relbuf(rel, metabuf);

209

210 /*

211 * Trigger cleanup in rare cases where prev_num_delpages exceeds 5% of the

212 * total size of the index. We can reasonably expect (though are not

213 * guaranteed) to be able to recycle this many pages if we decide to do a

214 * btvacuumscan call during the ongoing btvacuumcleanup. For further

215 * details see the nbtree/README section on placing deleted pages in the

216 * FSM.

217 */

218 if (prev_num_delpages > 0 &&

219 prev_num_delpages > RelationGetNumberOfBlocks(rel) / 20)

220 return true;

221

222 return false;

223}

224

225/*

226 * _bt_set_cleanup_info() -- Update metapage for btvacuumcleanup.

227 *

228 * Called at the end of btvacuumcleanup, when num_delpages value has been

229 * finalized.

230 */

231void

232 _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)

233{

234 Buffer metabuf;

235 Page metapg;

236 BTMetaPageData *metad;

237

238 /*

239 * On-disk compatibility note: The btm_last_cleanup_num_delpages metapage

240 * field started out as a TransactionId field called btm_oldest_btpo_xact.

241 * Both "versions" are just uint32 fields. It was convenient to repurpose

242 * the field when we began to use 64-bit XIDs in deleted pages.

243 *

244 * It's possible that a pg_upgrade'd database will contain an XID value in

245 * what is now recognized as the metapage's btm_last_cleanup_num_delpages

246 * field. _bt_vacuum_needs_cleanup() may even believe that this value

247 * indicates that there are lots of pages that it needs to recycle, when

248 * in reality there are only one or two. The worst that can happen is

249 * that there will be a call to btvacuumscan a little earlier, which will

250 * set btm_last_cleanup_num_delpages to a sane value when we're called.

251 *

252 * Note also that the metapage's btm_last_cleanup_num_heap_tuples field is

253 * no longer used as of PostgreSQL 14. We set it to -1.0 on rewrite, just

254 * to be consistent.

255 */

256 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

257 metapg = BufferGetPage(metabuf);

258 metad = BTPageGetMeta(metapg);

259

260 /* Don't miss chance to upgrade index/metapage when BTREE_MIN_VERSION */

261 if (metad->btm_version >= BTREE_NOVAC_VERSION &&

262 metad->btm_last_cleanup_num_delpages == num_delpages)

263 {

264 /* Usually means index continues to have num_delpages of 0 */

265 _bt_relbuf(rel, metabuf);

266 return;

267 }

268

269 /* trade in our read lock for a write lock */

270 _bt_unlockbuf(rel, metabuf);

271 _bt_lockbuf(rel, metabuf, BT_WRITE);

272

273 START_CRIT_SECTION();

274

275 /* upgrade meta-page if needed */

276 if (metad->btm_version < BTREE_NOVAC_VERSION)

277 _bt_upgrademetapage(metapg);

278

279 /* update cleanup-related information */

280 metad->btm_last_cleanup_num_delpages = num_delpages;

281 metad->btm_last_cleanup_num_heap_tuples = -1.0;

282 MarkBufferDirty(metabuf);

283

284 /* write wal record if needed */

285 if (RelationNeedsWAL(rel))

286 {

287 xl_btree_metadata md;

288 XLogRecPtr recptr;

289

290 XLogBeginInsert();

291 XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

292

293 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);

294 md.version = metad->btm_version;

295 md.root = metad->btm_root;

296 md.level = metad->btm_level;

297 md.fastroot = metad->btm_fastroot;

298 md.fastlevel = metad->btm_fastlevel;

299 md.last_cleanup_num_delpages = num_delpages;

300 md.allequalimage = metad->btm_allequalimage;

301

302 XLogRegisterBufData(0, &md, sizeof(xl_btree_metadata));

303

304 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_META_CLEANUP);

305

306 PageSetLSN(metapg, recptr);

307 }

308

309 END_CRIT_SECTION();

310

311 _bt_relbuf(rel, metabuf);

312}

313

314/*

315 * _bt_getroot() -- Get the root page of the btree.

316 *

317 * Since the root page can move around the btree file, we have to read

318 * its location from the metadata page, and then read the root page

319 * itself. If no root page exists yet, we have to create one.

320 *

321 * The access type parameter (BT_READ or BT_WRITE) controls whether

322 * a new root page will be created or not. If access = BT_READ,

323 * and no root page exists, we just return InvalidBuffer. For

324 * BT_WRITE, we try to create the root page if it doesn't exist.

325 * NOTE that the returned root page will have only a read lock set

326 * on it even if access = BT_WRITE!

327 *

328 * If access = BT_WRITE, heaprel must be set; otherwise caller can just

329 * pass NULL. See _bt_allocbuf for an explanation.

330 *

331 * The returned page is not necessarily the true root --- it could be

332 * a "fast root" (a page that is alone in its level due to deletions).

333 * Also, if the root page is split while we are "in flight" to it,

334 * what we will return is the old root, which is now just the leftmost

335 * page on a probably-not-very-wide level. For most purposes this is

336 * as good as or better than the true root, so we do not bother to

337 * insist on finding the true root. We do, however, guarantee to

338 * return a live (not deleted or half-dead) page.

339 *

340 * On successful return, the root page is pinned and read-locked.

341 * The metadata page is not locked or pinned on exit.

342 */

343Buffer

344 _bt_getroot(Relation rel, Relation heaprel, int access)

345{

346 Buffer metabuf;

347 Buffer rootbuf;

348 Page rootpage;

349 BTPageOpaque rootopaque;

350 BlockNumber rootblkno;

351 uint32 rootlevel;

352 BTMetaPageData *metad;

353

354 Assert(access == BT_READ || heaprel != NULL);

355

356 /*

357 * Try to use previously-cached metapage data to find the root. This

358 * normally saves one buffer access per index search, which is a very

359 * helpful savings in bufmgr traffic and hence contention.

360 */

361 if (rel->rd_amcache != NULL)

362 {

363 metad = (BTMetaPageData *) rel->rd_amcache;

364 /* We shouldn't have cached it if any of these fail */

365 Assert(metad->btm_magic == BTREE_MAGIC);

366 Assert(metad->btm_version >= BTREE_MIN_VERSION);

367 Assert(metad->btm_version <= BTREE_VERSION);

368 Assert(!metad->btm_allequalimage ||

369 metad->btm_version > BTREE_NOVAC_VERSION);

370 Assert(metad->btm_root != P_NONE);

371

372 rootblkno = metad->btm_fastroot;

373 Assert(rootblkno != P_NONE);

374 rootlevel = metad->btm_fastlevel;

375

376 rootbuf = _bt_getbuf(rel, rootblkno, BT_READ);

377 rootpage = BufferGetPage(rootbuf);

378 rootopaque = BTPageGetOpaque(rootpage);

379

380 /*

381 * Since the cache might be stale, we check the page more carefully

382 * here than normal. We *must* check that it's not deleted. If it's

383 * not alone on its level, then we reject too --- this may be overly

384 * paranoid but better safe than sorry. Note we don't check P_ISROOT,

385 * because that's not set in a "fast root".

386 */

387 if (!P_IGNORE(rootopaque) &&

388 rootopaque->btpo_level == rootlevel &&

389 P_LEFTMOST(rootopaque) &&

390 P_RIGHTMOST(rootopaque))

391 {

392 /* OK, accept cached page as the root */

393 return rootbuf;

394 }

395 _bt_relbuf(rel, rootbuf);

396 /* Cache is stale, throw it away */

397 if (rel->rd_amcache)

398 pfree(rel->rd_amcache);

399 rel->rd_amcache = NULL;

400 }

401

402 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

403 metad = _bt_getmeta(rel, metabuf);

404

405 /* if no root page initialized yet, do it */

406 if (metad->btm_root == P_NONE)

407 {

408 Page metapg;

409

410 /* If access = BT_READ, caller doesn't want us to create root yet */

411 if (access == BT_READ)

412 {

413 _bt_relbuf(rel, metabuf);

414 return InvalidBuffer;

415 }

416

417 /* trade in our read lock for a write lock */

418 _bt_unlockbuf(rel, metabuf);

419 _bt_lockbuf(rel, metabuf, BT_WRITE);

420

421 /*

422 * Race condition: if someone else initialized the metadata between

423 * the time we released the read lock and acquired the write lock, we

424 * must avoid doing it again.

425 */

426 if (metad->btm_root != P_NONE)

427 {

428 /*

429 * Metadata initialized by someone else. In order to guarantee no

430 * deadlocks, we have to release the metadata page and start all

431 * over again. (Is that really true? But it's hardly worth trying

432 * to optimize this case.)

433 */

434 _bt_relbuf(rel, metabuf);

435 return _bt_getroot(rel, heaprel, access);

436 }

437

438 /*

439 * Get, initialize, write, and leave a lock of the appropriate type on

440 * the new root page. Since this is the first page in the tree, it's

441 * a leaf as well as the root.

442 */

443 rootbuf = _bt_allocbuf(rel, heaprel);

444 rootblkno = BufferGetBlockNumber(rootbuf);

445 rootpage = BufferGetPage(rootbuf);

446 rootopaque = BTPageGetOpaque(rootpage);

447 rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;

448 rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);

449 rootopaque->btpo_level = 0;

450 rootopaque->btpo_cycleid = 0;

451 /* Get raw page pointer for metapage */

452 metapg = BufferGetPage(metabuf);

453

454 /* NO ELOG(ERROR) till meta is updated */

455 START_CRIT_SECTION();

456

457 /* upgrade metapage if needed */

458 if (metad->btm_version < BTREE_NOVAC_VERSION)

459 _bt_upgrademetapage(metapg);

460

461 metad->btm_root = rootblkno;

462 metad->btm_level = 0;

463 metad->btm_fastroot = rootblkno;

464 metad->btm_fastlevel = 0;

465 metad->btm_last_cleanup_num_delpages = 0;

466 metad->btm_last_cleanup_num_heap_tuples = -1.0;

467

468 MarkBufferDirty(rootbuf);

469 MarkBufferDirty(metabuf);

470

471 /* XLOG stuff */

472 if (RelationNeedsWAL(rel))

473 {

474 xl_btree_newroot xlrec;

475 XLogRecPtr recptr;

476 xl_btree_metadata md;

477

478 XLogBeginInsert();

479 XLogRegisterBuffer(0, rootbuf, REGBUF_WILL_INIT);

480 XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

481

482 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);

483 md.version = metad->btm_version;

484 md.root = rootblkno;

485 md.level = 0;

486 md.fastroot = rootblkno;

487 md.fastlevel = 0;

488 md.last_cleanup_num_delpages = 0;

489 md.allequalimage = metad->btm_allequalimage;

490

491 XLogRegisterBufData(2, &md, sizeof(xl_btree_metadata));

492

493 xlrec.rootblk = rootblkno;

494 xlrec.level = 0;

495

496 XLogRegisterData(&xlrec, SizeOfBtreeNewroot);

497

498 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT);

499

500 PageSetLSN(rootpage, recptr);

501 PageSetLSN(metapg, recptr);

502 }

503

504 END_CRIT_SECTION();

505

506 /*

507 * swap root write lock for read lock. There is no danger of anyone

508 * else accessing the new root page while it's unlocked, since no one

509 * else knows where it is yet.

510 */

511 _bt_unlockbuf(rel, rootbuf);

512 _bt_lockbuf(rel, rootbuf, BT_READ);

513

514 /* okay, metadata is correct, release lock on it without caching */

515 _bt_relbuf(rel, metabuf);

516 }

517 else

518 {

519 rootblkno = metad->btm_fastroot;

520 Assert(rootblkno != P_NONE);

521 rootlevel = metad->btm_fastlevel;

522

523 /*

524 * Cache the metapage data for next time

525 */

526 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,

527 sizeof(BTMetaPageData));

528 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

529

530 /*

531 * We are done with the metapage; arrange to release it via first

532 * _bt_relandgetbuf call

533 */

534 rootbuf = metabuf;

535

536 for (;;)

537 {

538 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);

539 rootpage = BufferGetPage(rootbuf);

540 rootopaque = BTPageGetOpaque(rootpage);

541

542 if (!P_IGNORE(rootopaque))

543 break;

544

545 /* it's dead, Jim. step right one page */

546 if (P_RIGHTMOST(rootopaque))

547 elog(ERROR, "no live root page found in index \"%s\"",

548 RelationGetRelationName(rel));

549 rootblkno = rootopaque->btpo_next;

550 }

551

552 if (rootopaque->btpo_level != rootlevel)

553 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",

554 rootblkno, RelationGetRelationName(rel),

555 rootopaque->btpo_level, rootlevel);

556 }

557

558 /*

559 * By here, we have a pin and read lock on the root page, and no lock set

560 * on the metadata page. Return the root page's buffer.

561 */

562 return rootbuf;

563}

564

565/*

566 * _bt_gettrueroot() -- Get the true root page of the btree.

567 *

568 * This is the same as the BT_READ case of _bt_getroot(), except

569 * we follow the true-root link not the fast-root link.

570 *

571 * By the time we acquire lock on the root page, it might have been split and

572 * not be the true root anymore. This is okay for the present uses of this

573 * routine; we only really need to be able to move up at least one tree level

574 * from whatever non-root page we were at. If we ever do need to lock the

575 * one true root page, we could loop here, re-reading the metapage on each

576 * failure. (Note that it wouldn't do to hold the lock on the metapage while

577 * moving to the root --- that'd deadlock against any concurrent root split.)

578 */

579Buffer

580 _bt_gettrueroot(Relation rel)

581{

582 Buffer metabuf;

583 Page metapg;

584 BTPageOpaque metaopaque;

585 Buffer rootbuf;

586 Page rootpage;

587 BTPageOpaque rootopaque;

588 BlockNumber rootblkno;

589 uint32 rootlevel;

590 BTMetaPageData *metad;

591

592 /*

593 * We don't try to use cached metapage data here, since (a) this path is

594 * not performance-critical, and (b) if we are here it suggests our cache

595 * is out-of-date anyway. In light of point (b), it's probably safest to

596 * actively flush any cached metapage info.

597 */

598 if (rel->rd_amcache)

599 pfree(rel->rd_amcache);

600 rel->rd_amcache = NULL;

601

602 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

603 metapg = BufferGetPage(metabuf);

604 metaopaque = BTPageGetOpaque(metapg);

605 metad = BTPageGetMeta(metapg);

606

607 if (!P_ISMETA(metaopaque) ||

608 metad->btm_magic != BTREE_MAGIC)

609 ereport(ERROR,

610 (errcode(ERRCODE_INDEX_CORRUPTED),

611 errmsg("index \"%s\" is not a btree",

612 RelationGetRelationName(rel))));

613

614 if (metad->btm_version < BTREE_MIN_VERSION ||

615 metad->btm_version > BTREE_VERSION)

616 ereport(ERROR,

617 (errcode(ERRCODE_INDEX_CORRUPTED),

618 errmsg("version mismatch in index \"%s\": file version %d, "

619 "current version %d, minimal supported version %d",

620 RelationGetRelationName(rel),

621 metad->btm_version, BTREE_VERSION, BTREE_MIN_VERSION)));

622

623 /* if no root page initialized yet, fail */

624 if (metad->btm_root == P_NONE)

625 {

626 _bt_relbuf(rel, metabuf);

627 return InvalidBuffer;

628 }

629

630 rootblkno = metad->btm_root;

631 rootlevel = metad->btm_level;

632

633 /*

634 * We are done with the metapage; arrange to release it via first

635 * _bt_relandgetbuf call

636 */

637 rootbuf = metabuf;

638

639 for (;;)

640 {

641 rootbuf = _bt_relandgetbuf(rel, rootbuf, rootblkno, BT_READ);

642 rootpage = BufferGetPage(rootbuf);

643 rootopaque = BTPageGetOpaque(rootpage);

644

645 if (!P_IGNORE(rootopaque))

646 break;

647

648 /* it's dead, Jim. step right one page */

649 if (P_RIGHTMOST(rootopaque))

650 elog(ERROR, "no live root page found in index \"%s\"",

651 RelationGetRelationName(rel));

652 rootblkno = rootopaque->btpo_next;

653 }

654

655 if (rootopaque->btpo_level != rootlevel)

656 elog(ERROR, "root page %u of index \"%s\" has level %u, expected %u",

657 rootblkno, RelationGetRelationName(rel),

658 rootopaque->btpo_level, rootlevel);

659

660 return rootbuf;

661}

662

663/*

664 * _bt_getrootheight() -- Get the height of the btree search tree.

665 *

666 * We return the level (counting from zero) of the current fast root.

667 * This represents the number of tree levels we'd have to descend through

668 * to start any btree index search.

669 *

670 * This is used by the planner for cost-estimation purposes. Since it's

671 * only an estimate, slightly-stale data is fine, hence we don't worry

672 * about updating previously cached data.

673 */

674int

675 _bt_getrootheight(Relation rel)

676{

677 BTMetaPageData *metad;

678

679 if (rel->rd_amcache == NULL)

680 {

681 Buffer metabuf;

682

683 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

684 metad = _bt_getmeta(rel, metabuf);

685

686 /*

687 * If there's no root page yet, _bt_getroot() doesn't expect a cache

688 * to be made, so just stop here and report the index height is zero.

689 * (XXX perhaps _bt_getroot() should be changed to allow this case.)

690 */

691 if (metad->btm_root == P_NONE)

692 {

693 _bt_relbuf(rel, metabuf);

694 return 0;

695 }

696

697 /*

698 * Cache the metapage data for next time

699 */

700 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,

701 sizeof(BTMetaPageData));

702 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

703 _bt_relbuf(rel, metabuf);

704 }

705

706 /* Get cached page */

707 metad = (BTMetaPageData *) rel->rd_amcache;

708 /* We shouldn't have cached it if any of these fail */

709 Assert(metad->btm_magic == BTREE_MAGIC);

710 Assert(metad->btm_version >= BTREE_MIN_VERSION);

711 Assert(metad->btm_version <= BTREE_VERSION);

712 Assert(!metad->btm_allequalimage ||

713 metad->btm_version > BTREE_NOVAC_VERSION);

714 Assert(metad->btm_fastroot != P_NONE);

715

716 return metad->btm_fastlevel;

717}

718

719/*

720 * _bt_metaversion() -- Get version/status info from metapage.

721 *

722 * Sets caller's *heapkeyspace and *allequalimage arguments using data

723 * from the B-Tree metapage (could be locally-cached version). This

724 * information needs to be stashed in insertion scankey, so we provide a

725 * single function that fetches both at once.

726 *

727 * This is used to determine the rules that must be used to descend a

728 * btree. Version 4 indexes treat heap TID as a tiebreaker attribute.

729 * pg_upgrade'd version 3 indexes need extra steps to preserve reasonable

730 * performance when inserting a new BTScanInsert-wise duplicate tuple

731 * among many leaf pages already full of such duplicates.

732 *

733 * Also sets allequalimage field, which indicates whether or not it is

734 * safe to apply deduplication. We rely on the assumption that

735 * btm_allequalimage will be zero'ed on heapkeyspace indexes that were

736 * pg_upgrade'd from Postgres 12.

737 */

738void

739 _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)

740{

741 BTMetaPageData *metad;

742

743 if (rel->rd_amcache == NULL)

744 {

745 Buffer metabuf;

746

747 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_READ);

748 metad = _bt_getmeta(rel, metabuf);

749

750 /*

751 * If there's no root page yet, _bt_getroot() doesn't expect a cache

752 * to be made, so just stop here. (XXX perhaps _bt_getroot() should

753 * be changed to allow this case.)

754 */

755 if (metad->btm_root == P_NONE)

756 {

757 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;

758 *allequalimage = metad->btm_allequalimage;

759

760 _bt_relbuf(rel, metabuf);

761 return;

762 }

763

764 /*

765 * Cache the metapage data for next time

766 *

767 * An on-the-fly version upgrade performed by _bt_upgrademetapage()

768 * can change the nbtree version for an index without invalidating any

769 * local cache. This is okay because it can only happen when moving

770 * from version 2 to version 3, both of which are !heapkeyspace

771 * versions.

772 */

773 rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,

774 sizeof(BTMetaPageData));

775 memcpy(rel->rd_amcache, metad, sizeof(BTMetaPageData));

776 _bt_relbuf(rel, metabuf);

777 }

778

779 /* Get cached page */

780 metad = (BTMetaPageData *) rel->rd_amcache;

781 /* We shouldn't have cached it if any of these fail */

782 Assert(metad->btm_magic == BTREE_MAGIC);

783 Assert(metad->btm_version >= BTREE_MIN_VERSION);

784 Assert(metad->btm_version <= BTREE_VERSION);

785 Assert(!metad->btm_allequalimage ||

786 metad->btm_version > BTREE_NOVAC_VERSION);

787 Assert(metad->btm_fastroot != P_NONE);

788

789 *heapkeyspace = metad->btm_version > BTREE_NOVAC_VERSION;

790 *allequalimage = metad->btm_allequalimage;

791}

792

793/*

794 * _bt_checkpage() -- Verify that a freshly-read page looks sane.

795 */

796void

797 _bt_checkpage(Relation rel, Buffer buf)

798{

799 Page page = BufferGetPage(buf);

800

801 /*

802 * ReadBuffer verifies that every newly-read page passes

803 * PageHeaderIsValid, which means it either contains a reasonably sane

804 * page header or is all-zero. We have to defend against the all-zero

805 * case, however.

806 */

807 if (PageIsNew(page))

808 ereport(ERROR,

809 (errcode(ERRCODE_INDEX_CORRUPTED),

810 errmsg("index \"%s\" contains unexpected zero page at block %u",

811 RelationGetRelationName(rel),

812 BufferGetBlockNumber(buf)),

813 errhint("Please REINDEX it.")));

814

815 /*

816 * Additionally check that the special area looks sane.

817 */

818 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(BTPageOpaqueData)))

819 ereport(ERROR,

820 (errcode(ERRCODE_INDEX_CORRUPTED),

821 errmsg("index \"%s\" contains corrupted page at block %u",

822 RelationGetRelationName(rel),

823 BufferGetBlockNumber(buf)),

824 errhint("Please REINDEX it.")));

825}

826

827/*

828 * _bt_getbuf() -- Get an existing block in a buffer, for read or write.

829 *

830 * The general rule in nbtree is that it's never okay to access a

831 * page without holding both a buffer pin and a buffer lock on

832 * the page's buffer.

833 *

834 * When this routine returns, the appropriate lock is set on the

835 * requested buffer and its reference count has been incremented

836 * (ie, the buffer is "locked and pinned"). Also, we apply

837 * _bt_checkpage to sanity-check the page, and perform Valgrind

838 * client requests that help Valgrind detect unsafe page accesses.

839 *

840 * Note: raw LockBuffer() calls are disallowed in nbtree; all

841 * buffer lock requests need to go through wrapper functions such

842 * as _bt_lockbuf().

843 */

844Buffer

845 _bt_getbuf(Relation rel, BlockNumber blkno, int access)

846{

847 Buffer buf;

848

849 Assert(BlockNumberIsValid(blkno));

850

851 /* Read an existing block of the relation */

852 buf = ReadBuffer(rel, blkno);

853 _bt_lockbuf(rel, buf, access);

854 _bt_checkpage(rel, buf);

855

856 return buf;

857}

858

859/*

860 * _bt_allocbuf() -- Allocate a new block/page.

861 *

862 * Returns a write-locked buffer containing an unallocated nbtree page.

863 *

864 * Callers are required to pass a valid heaprel. We need heaprel so that we

865 * can handle generating a snapshotConflictHorizon that makes reusing a page

866 * from the FSM safe for queries that may be running on standbys.

867 */

868Buffer

869 _bt_allocbuf(Relation rel, Relation heaprel)

870{

871 Buffer buf;

872 BlockNumber blkno;

873 Page page;

874

875 Assert(heaprel != NULL);

876

877 /*

878 * First see if the FSM knows of any free pages.

879 *

880 * We can't trust the FSM's report unreservedly; we have to check that the

881 * page is still free. (For example, an already-free page could have been

882 * re-used between the time the last VACUUM scanned it and the time the

883 * VACUUM made its FSM updates.)

884 *

885 * In fact, it's worse than that: we can't even assume that it's safe to

886 * take a lock on the reported page. If somebody else has a lock on it,

887 * or even worse our own caller does, we could deadlock. (The own-caller

888 * scenario is actually not improbable. Consider an index on a serial or

889 * timestamp column. Nearly all splits will be at the rightmost page, so

890 * it's entirely likely that _bt_split will call us while holding a lock

891 * on the page most recently acquired from FSM. A VACUUM running

892 * concurrently with the previous split could well have placed that page

893 * back in FSM.)

894 *

895 * To get around that, we ask for only a conditional lock on the reported

896 * page. If we fail, then someone else is using the page, and we may

897 * reasonably assume it's not free. (If we happen to be wrong, the worst

898 * consequence is the page will be lost to use till the next VACUUM, which

899 * is no big problem.)

900 */

901 for (;;)

902 {

903 blkno = GetFreeIndexPage(rel);

904 if (blkno == InvalidBlockNumber)

905 break;

906 buf = ReadBuffer(rel, blkno);

907 if (_bt_conditionallockbuf(rel, buf))

908 {

909 page = BufferGetPage(buf);

910

911 /*

912 * It's possible to find an all-zeroes page in an index. For

913 * example, a backend might successfully extend the relation one

914 * page and then crash before it is able to make a WAL entry for

915 * adding the page. If we find a zeroed page then reclaim it

916 * immediately.

917 */

918 if (PageIsNew(page))

919 {

920 /* Okay to use page. Initialize and return it. */

921 _bt_pageinit(page, BufferGetPageSize(buf));

922 return buf;

923 }

924

925 if (BTPageIsRecyclable(page, heaprel))

926 {

927 /*

928 * If we are generating WAL for Hot Standby then create a WAL

929 * record that will allow us to conflict with queries running

930 * on standby, in case they have snapshots older than safexid

931 * value

932 */

933 if (RelationNeedsWAL(rel) && XLogStandbyInfoActive())

934 {

935 xl_btree_reuse_page xlrec_reuse;

936

937 /*

938 * Note that we don't register the buffer with the record,

939 * because this operation doesn't modify the page (that

940 * already happened, back when VACUUM deleted the page).

941 * This record only exists to provide a conflict point for

942 * Hot Standby. See record REDO routine comments.

943 */

944 xlrec_reuse.locator = rel->rd_locator;

945 xlrec_reuse.block = blkno;

946 xlrec_reuse.snapshotConflictHorizon = BTPageGetDeleteXid(page);

947 xlrec_reuse.isCatalogRel =

948 RelationIsAccessibleInLogicalDecoding(heaprel);

949

950 XLogBeginInsert();

951 XLogRegisterData(&xlrec_reuse, SizeOfBtreeReusePage);

952

953 XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE);

954 }

955

956 /* Okay to use page. Re-initialize and return it. */

957 _bt_pageinit(page, BufferGetPageSize(buf));

958 return buf;

959 }

960 elog(DEBUG2, "FSM returned nonrecyclable page");

961 _bt_relbuf(rel, buf);

962 }

963 else

964 {

965 elog(DEBUG2, "FSM returned nonlockable page");

966 /* couldn't get lock, so just drop pin */

967 ReleaseBuffer(buf);

968 }

969 }

970

971 /*

972 * Extend the relation by one page. Need to use RBM_ZERO_AND_LOCK or we

973 * risk a race condition against btvacuumscan --- see comments therein.

974 * This forces us to repeat the valgrind request that _bt_lockbuf()

975 * otherwise would make, as we can't use _bt_lockbuf() without introducing

976 * a race.

977 */

978 buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);

979 if (!RelationUsesLocalBuffers(rel))

980 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

981

982 /* Initialize the new page before returning it */

983 page = BufferGetPage(buf);

984 Assert(PageIsNew(page));

985 _bt_pageinit(page, BufferGetPageSize(buf));

986

987 return buf;

988}

989

990/*

991 * _bt_relandgetbuf() -- release a locked buffer and get another one.

992 *

993 * This is equivalent to _bt_relbuf followed by _bt_getbuf. Also, if obuf is

994 * InvalidBuffer then it reduces to just _bt_getbuf; allowing this case

995 * simplifies some callers.

996 *

997 * The original motivation for using this was to avoid two entries to the

998 * bufmgr when one would do. However, now it's mainly just a notational

999 * convenience. The only case where it saves work over _bt_relbuf/_bt_getbuf

1000 * is when the target page is the same one already in the buffer.

1001 */

1002Buffer

1003 _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)

1004{

1005 Buffer buf;

1006

1007 Assert(BlockNumberIsValid(blkno));

1008 if (BufferIsValid(obuf))

1009 _bt_unlockbuf(rel, obuf);

1010 buf = ReleaseAndReadBuffer(obuf, rel, blkno);

1011 _bt_lockbuf(rel, buf, access);

1012

1013 _bt_checkpage(rel, buf);

1014 return buf;

1015}

1016

1017/*

1018 * _bt_relbuf() -- release a locked buffer.

1019 *

1020 * Lock and pin (refcount) are both dropped.

1021 */

1022void

1023 _bt_relbuf(Relation rel, Buffer buf)

1024{

1025 _bt_unlockbuf(rel, buf);

1026 ReleaseBuffer(buf);

1027}

1028

1029/*

1030 * _bt_lockbuf() -- lock a pinned buffer.

1031 *

1032 * Lock is acquired without acquiring another pin. This is like a raw

1033 * LockBuffer() call, but performs extra steps needed by Valgrind.

1034 *

1035 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf

1036 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().

1037 */

1038void

1039 _bt_lockbuf(Relation rel, Buffer buf, int access)

1040{

1041 /* LockBuffer() asserts that pin is held by this backend */

1042 LockBuffer(buf, access);

1043

1044 /*

1045 * It doesn't matter that _bt_unlockbuf() won't get called in the event of

1046 * an nbtree error (e.g. a unique violation error). That won't cause

1047 * Valgrind false positives.

1048 *

1049 * The nbtree client requests are superimposed on top of the bufmgr.c

1050 * buffer pin client requests. In the event of an nbtree error the buffer

1051 * will certainly get marked as defined when the backend once again

1052 * acquires its first pin on the buffer. (Of course, if the backend never

1053 * touches the buffer again then it doesn't matter that it remains

1054 * non-accessible to Valgrind.)

1055 *

1056 * Note: When an IndexTuple C pointer gets computed using an ItemId read

1057 * from a page while a lock was held, the C pointer becomes unsafe to

1058 * dereference forever as soon as the lock is released. Valgrind can only

1059 * detect cases where the pointer gets dereferenced with no _current_

1060 * lock/pin held, though.

1061 */

1062 if (!RelationUsesLocalBuffers(rel))

1063 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

1064}

1065

1066/*

1067 * _bt_unlockbuf() -- unlock a pinned buffer.

1068 */

1069void

1070 _bt_unlockbuf(Relation rel, Buffer buf)

1071{

1072 /*

1073 * Buffer is pinned and locked, which means that it is expected to be

1074 * defined and addressable. Check that proactively.

1075 */

1076 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

1077

1078 /* LockBuffer() asserts that pin is held by this backend */

1079 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

1080

1081 if (!RelationUsesLocalBuffers(rel))

1082 VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);

1083}

1084

1085/*

1086 * _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned

1087 * buffer.

1088 *

1089 * Note: Caller may need to call _bt_checkpage() with buf when pin on buf

1090 * wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().

1091 */

1092bool

1093 _bt_conditionallockbuf(Relation rel, Buffer buf)

1094{

1095 /* ConditionalLockBuffer() asserts that pin is held by this backend */

1096 if (!ConditionalLockBuffer(buf))

1097 return false;

1098

1099 if (!RelationUsesLocalBuffers(rel))

1100 VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);

1101

1102 return true;

1103}

1104

1105/*

1106 * _bt_upgradelockbufcleanup() -- upgrade lock to a full cleanup lock.

1107 */

1108void

1109 _bt_upgradelockbufcleanup(Relation rel, Buffer buf)

1110{

1111 /*

1112 * Buffer is pinned and locked, which means that it is expected to be

1113 * defined and addressable. Check that proactively.

1114 */

1115 VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);

1116

1117 /* LockBuffer() asserts that pin is held by this backend */

1118 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

1119 LockBufferForCleanup(buf);

1120}

1121

1122/*

1123 * _bt_pageinit() -- Initialize a new page.

1124 *

1125 * On return, the page header is initialized; data space is empty;

1126 * special space is zeroed out.

1127 */

1128void

1129 _bt_pageinit(Page page, Size size)

1130{

1131 PageInit(page, size, sizeof(BTPageOpaqueData));

1132}

1133

1134/*

1135 * Delete item(s) from a btree leaf page during VACUUM.

1136 *

1137 * This routine assumes that the caller already has a full cleanup lock on

1138 * the buffer. Also, the given deletable and updatable arrays *must* be

1139 * sorted in ascending order.

1140 *

1141 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs

1142 * in an existing posting list item are to be removed. This works by

1143 * updating/overwriting an existing item with caller's new version of the item

1144 * (a version that lacks the TIDs that are to be deleted).

1145 *

1146 * We record VACUUMs and b-tree deletes differently in WAL. Deletes must

1147 * generate their own snapshotConflictHorizon directly from the tableam,

1148 * whereas VACUUMs rely on the initial VACUUM table scan performing

1149 * WAL-logging that takes care of the issue for the table's indexes

1150 * indirectly. Also, we remove the VACUUM cycle ID from pages, which b-tree

1151 * deletes don't do.

1152 */

1153void

1154 _bt_delitems_vacuum(Relation rel, Buffer buf,

1155 OffsetNumber *deletable, int ndeletable,

1156 BTVacuumPosting *updatable, int nupdatable)

1157{

1158 Page page = BufferGetPage(buf);

1159 BTPageOpaque opaque;

1160 bool needswal = RelationNeedsWAL(rel);

1161 char *updatedbuf = NULL;

1162 Size updatedbuflen = 0;

1163 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

1164

1165 /* Shouldn't be called unless there's something to do */

1166 Assert(ndeletable > 0 || nupdatable > 0);

1167

1168 /* Generate new version of posting lists without deleted TIDs */

1169 if (nupdatable > 0)

1170 updatedbuf = _bt_delitems_update(updatable, nupdatable,

1171 updatedoffsets, &updatedbuflen,

1172 needswal);

1173

1174 /* No ereport(ERROR) until changes are logged */

1175 START_CRIT_SECTION();

1176

1177 /*

1178 * Handle posting tuple updates.

1179 *

1180 * Deliberately do this before handling simple deletes. If we did it the

1181 * other way around (i.e. WAL record order -- simple deletes before

1182 * updates) then we'd have to make compensating changes to the 'updatable'

1183 * array of offset numbers.

1184 *

1185 * PageIndexTupleOverwrite() won't unset each item's LP_DEAD bit when it

1186 * happens to already be set. It's important that we not interfere with

1187 * any future simple index tuple deletion operations.

1188 */

1189 for (int i = 0; i < nupdatable; i++)

1190 {

1191 OffsetNumber updatedoffset = updatedoffsets[i];

1192 IndexTuple itup;

1193 Size itemsz;

1194

1195 itup = updatable[i]->itup;

1196 itemsz = MAXALIGN(IndexTupleSize(itup));

1197 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,

1198 itemsz))

1199 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",

1200 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

1201 }

1202

1203 /* Now handle simple deletes of entire tuples */

1204 if (ndeletable > 0)

1205 PageIndexMultiDelete(page, deletable, ndeletable);

1206

1207 /*

1208 * We can clear the vacuum cycle ID since this page has certainly been

1209 * processed by the current vacuum scan.

1210 */

1211 opaque = BTPageGetOpaque(page);

1212 opaque->btpo_cycleid = 0;

1213

1214 /*

1215 * Clear the BTP_HAS_GARBAGE page flag.

1216 *

1217 * This flag indicates the presence of LP_DEAD items on the page (though

1218 * not reliably). Note that we only rely on it with pg_upgrade'd

1219 * !heapkeyspace indexes. That's why clearing it here won't usually

1220 * interfere with simple index tuple deletion.

1221 */

1222 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

1223

1224 MarkBufferDirty(buf);

1225

1226 /* XLOG stuff */

1227 if (needswal)

1228 {

1229 XLogRecPtr recptr;

1230 xl_btree_vacuum xlrec_vacuum;

1231

1232 xlrec_vacuum.ndeleted = ndeletable;

1233 xlrec_vacuum.nupdated = nupdatable;

1234

1235 XLogBeginInsert();

1236 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

1237 XLogRegisterData(&xlrec_vacuum, SizeOfBtreeVacuum);

1238

1239 if (ndeletable > 0)

1240 XLogRegisterBufData(0, deletable,

1241 ndeletable * sizeof(OffsetNumber));

1242

1243 if (nupdatable > 0)

1244 {

1245 XLogRegisterBufData(0, updatedoffsets,

1246 nupdatable * sizeof(OffsetNumber));

1247 XLogRegisterBufData(0, updatedbuf, updatedbuflen);

1248 }

1249

1250 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);

1251

1252 PageSetLSN(page, recptr);

1253 }

1254

1255 END_CRIT_SECTION();

1256

1257 /* can't leak memory here */

1258 if (updatedbuf != NULL)

1259 pfree(updatedbuf);

1260 /* free tuples allocated within _bt_delitems_update() */

1261 for (int i = 0; i < nupdatable; i++)

1262 pfree(updatable[i]->itup);

1263}

1264

1265/*

1266 * Delete item(s) from a btree leaf page during single-page cleanup.

1267 *

1268 * This routine assumes that the caller has pinned and write locked the

1269 * buffer. Also, the given deletable and updatable arrays *must* be sorted in

1270 * ascending order.

1271 *

1272 * Routine deals with deleting TIDs when some (but not all) of the heap TIDs

1273 * in an existing posting list item are to be removed. This works by

1274 * updating/overwriting an existing item with caller's new version of the item

1275 * (a version that lacks the TIDs that are to be deleted).

1276 *

1277 * This is nearly the same as _bt_delitems_vacuum as far as what it does to

1278 * the page, but it needs its own snapshotConflictHorizon and isCatalogRel

1279 * (from the tableam). This is used by the REDO routine to generate recovery

1280 * conflicts. The other difference is that only _bt_delitems_vacuum will

1281 * clear page's VACUUM cycle ID.

1282 */

1283static void

1284 _bt_delitems_delete(Relation rel, Buffer buf,

1285 TransactionId snapshotConflictHorizon, bool isCatalogRel,

1286 OffsetNumber *deletable, int ndeletable,

1287 BTVacuumPosting *updatable, int nupdatable)

1288{

1289 Page page = BufferGetPage(buf);

1290 BTPageOpaque opaque;

1291 bool needswal = RelationNeedsWAL(rel);

1292 char *updatedbuf = NULL;

1293 Size updatedbuflen = 0;

1294 OffsetNumber updatedoffsets[MaxIndexTuplesPerPage];

1295

1296 /* Shouldn't be called unless there's something to do */

1297 Assert(ndeletable > 0 || nupdatable > 0);

1298

1299 /* Generate new versions of posting lists without deleted TIDs */

1300 if (nupdatable > 0)

1301 updatedbuf = _bt_delitems_update(updatable, nupdatable,

1302 updatedoffsets, &updatedbuflen,

1303 needswal);

1304

1305 /* No ereport(ERROR) until changes are logged */

1306 START_CRIT_SECTION();

1307

1308 /* Handle updates and deletes just like _bt_delitems_vacuum */

1309 for (int i = 0; i < nupdatable; i++)

1310 {

1311 OffsetNumber updatedoffset = updatedoffsets[i];

1312 IndexTuple itup;

1313 Size itemsz;

1314

1315 itup = updatable[i]->itup;

1316 itemsz = MAXALIGN(IndexTupleSize(itup));

1317 if (!PageIndexTupleOverwrite(page, updatedoffset, (Item) itup,

1318 itemsz))

1319 elog(PANIC, "failed to update partially dead item in block %u of index \"%s\"",

1320 BufferGetBlockNumber(buf), RelationGetRelationName(rel));

1321 }

1322

1323 if (ndeletable > 0)

1324 PageIndexMultiDelete(page, deletable, ndeletable);

1325

1326 /*

1327 * Unlike _bt_delitems_vacuum, we *must not* clear the vacuum cycle ID at

1328 * this point. The VACUUM command alone controls vacuum cycle IDs.

1329 */

1330 opaque = BTPageGetOpaque(page);

1331

1332 /*

1333 * Clear the BTP_HAS_GARBAGE page flag.

1334 *

1335 * This flag indicates the presence of LP_DEAD items on the page (though

1336 * not reliably). Note that we only rely on it with pg_upgrade'd

1337 * !heapkeyspace indexes.

1338 */

1339 opaque->btpo_flags &= ~BTP_HAS_GARBAGE;

1340

1341 MarkBufferDirty(buf);

1342

1343 /* XLOG stuff */

1344 if (needswal)

1345 {

1346 XLogRecPtr recptr;

1347 xl_btree_delete xlrec_delete;

1348

1349 xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;

1350 xlrec_delete.ndeleted = ndeletable;

1351 xlrec_delete.nupdated = nupdatable;

1352 xlrec_delete.isCatalogRel = isCatalogRel;

1353

1354 XLogBeginInsert();

1355 XLogRegisterBuffer(0, buf, REGBUF_STANDARD);

1356 XLogRegisterData(&xlrec_delete, SizeOfBtreeDelete);

1357

1358 if (ndeletable > 0)

1359 XLogRegisterBufData(0, deletable,

1360 ndeletable * sizeof(OffsetNumber));

1361

1362 if (nupdatable > 0)

1363 {

1364 XLogRegisterBufData(0, updatedoffsets,

1365 nupdatable * sizeof(OffsetNumber));

1366 XLogRegisterBufData(0, updatedbuf, updatedbuflen);

1367 }

1368

1369 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE);

1370

1371 PageSetLSN(page, recptr);

1372 }

1373

1374 END_CRIT_SECTION();

1375

1376 /* can't leak memory here */

1377 if (updatedbuf != NULL)

1378 pfree(updatedbuf);

1379 /* free tuples allocated within _bt_delitems_update() */

1380 for (int i = 0; i < nupdatable; i++)

1381 pfree(updatable[i]->itup);

1382}

1383

1384/*

1385 * Set up state needed to delete TIDs from posting list tuples via "updating"

1386 * the tuple. Performs steps common to both _bt_delitems_vacuum and

1387 * _bt_delitems_delete. These steps must take place before each function's

1388 * critical section begins.

1389 *

1390 * updatable and nupdatable are inputs, though note that we will use

1391 * _bt_update_posting() to replace the original itup with a pointer to a final

1392 * version in palloc()'d memory. Caller should free the tuples when its done.

1393 *

1394 * The first nupdatable entries from updatedoffsets are set to the page offset

1395 * number for posting list tuples that caller updates. This is mostly useful

1396 * because caller may need to WAL-log the page offsets (though we always do

1397 * this for caller out of convenience).

1398 *

1399 * Returns buffer consisting of an array of xl_btree_update structs that

1400 * describe the steps we perform here for caller (though only when needswal is

1401 * true). Also sets *updatedbuflen to the final size of the buffer. This

1402 * buffer is used by caller when WAL logging is required.

1403 */

1404static char *

1405 _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable,

1406 OffsetNumber *updatedoffsets, Size *updatedbuflen,

1407 bool needswal)

1408{

1409 char *updatedbuf = NULL;

1410 Size buflen = 0;

1411

1412 /* Shouldn't be called unless there's something to do */

1413 Assert(nupdatable > 0);

1414

1415 for (int i = 0; i < nupdatable; i++)

1416 {

1417 BTVacuumPosting vacposting = updatable[i];

1418 Size itemsz;

1419

1420 /* Replace work area IndexTuple with updated version */

1421 _bt_update_posting(vacposting);

1422

1423 /* Keep track of size of xl_btree_update for updatedbuf in passing */

1424 itemsz = SizeOfBtreeUpdate + vacposting->ndeletedtids * sizeof(uint16);

1425 buflen += itemsz;

1426

1427 /* Build updatedoffsets buffer in passing */

1428 updatedoffsets[i] = vacposting->updatedoffset;

1429 }

1430

1431 /* XLOG stuff */

1432 if (needswal)

1433 {

1434 Size offset = 0;

1435

1436 /* Allocate, set final size for caller */

1437 updatedbuf = palloc(buflen);

1438 *updatedbuflen = buflen;

1439 for (int i = 0; i < nupdatable; i++)

1440 {

1441 BTVacuumPosting vacposting = updatable[i];

1442 Size itemsz;

1443 xl_btree_update update;

1444

1445 update.ndeletedtids = vacposting->ndeletedtids;

1446 memcpy(updatedbuf + offset, &update.ndeletedtids,

1447 SizeOfBtreeUpdate);

1448 offset += SizeOfBtreeUpdate;

1449

1450 itemsz = update.ndeletedtids * sizeof(uint16);

1451 memcpy(updatedbuf + offset, vacposting->deletetids, itemsz);

1452 offset += itemsz;

1453 }

1454 }

1455

1456 return updatedbuf;

1457}

1458

1459/*

1460 * Comparator used by _bt_delitems_delete_check() to restore deltids array

1461 * back to its original leaf-page-wise sort order

1462 */

1463static int

1464 _bt_delitems_cmp(const void *a, const void *b)

1465{

1466 TM_IndexDelete *indexdelete1 = (TM_IndexDelete *) a;

1467 TM_IndexDelete *indexdelete2 = (TM_IndexDelete *) b;

1468

1469 Assert(indexdelete1->id != indexdelete2->id);

1470

1471 return pg_cmp_s16(indexdelete1->id, indexdelete2->id);

1472}

1473

1474/*

1475 * Try to delete item(s) from a btree leaf page during single-page cleanup.

1476 *

1477 * nbtree interface to table_index_delete_tuples(). Deletes a subset of index

1478 * tuples from caller's deltids array: those whose TIDs are found safe to

1479 * delete by the tableam (or already marked LP_DEAD in index, and so already

1480 * known to be deletable by our simple index deletion caller). We physically

1481 * delete index tuples from buf leaf page last of all (for index tuples where

1482 * that is known to be safe following our table_index_delete_tuples() call).

1483 *

1484 * Simple index deletion caller only includes TIDs from index tuples marked

1485 * LP_DEAD, as well as extra TIDs it found on the same leaf page that can be

1486 * included without increasing the total number of distinct table blocks for

1487 * the deletion operation as a whole. This approach often allows us to delete

1488 * some extra index tuples that were practically free for tableam to check in

1489 * passing (when they actually turn out to be safe to delete). It probably

1490 * only makes sense for the tableam to go ahead with these extra checks when

1491 * it is block-oriented (otherwise the checks probably won't be practically

1492 * free, which we rely on). The tableam interface requires the tableam side

1493 * to handle the problem, though, so this is okay (we as an index AM are free

1494 * to make the simplifying assumption that all tableams must be block-based).

1495 *

1496 * Bottom-up index deletion caller provides all the TIDs from the leaf page,

1497 * without expecting that tableam will check most of them. The tableam has

1498 * considerable discretion around which entries/blocks it checks. Our role in

1499 * costing the bottom-up deletion operation is strictly advisory.

1500 *

1501 * Note: Caller must have added deltids entries (i.e. entries that go in

1502 * delstate's main array) in leaf-page-wise order: page offset number order,

1503 * TID order among entries taken from the same posting list tuple (tiebreak on

1504 * TID). This order is convenient to work with here.

1505 *

1506 * Note: We also rely on the id field of each deltids element "capturing" this

1507 * original leaf-page-wise order. That is, we expect to be able to get back

1508 * to the original leaf-page-wise order just by sorting deltids on the id

1509 * field (tableam will sort deltids for its own reasons, so we'll need to put

1510 * it back in leaf-page-wise order afterwards).

1511 */

1512void

1513 _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel,

1514 TM_IndexDeleteOp *delstate)

1515{

1516 Page page = BufferGetPage(buf);

1517 TransactionId snapshotConflictHorizon;

1518 bool isCatalogRel;

1519 OffsetNumber postingidxoffnum = InvalidOffsetNumber;

1520 int ndeletable = 0,

1521 nupdatable = 0;

1522 OffsetNumber deletable[MaxIndexTuplesPerPage];

1523 BTVacuumPosting updatable[MaxIndexTuplesPerPage];

1524

1525 /* Use tableam interface to determine which tuples to delete first */

1526 snapshotConflictHorizon = table_index_delete_tuples(heapRel, delstate);

1527 isCatalogRel = RelationIsAccessibleInLogicalDecoding(heapRel);

1528

1529 /* Should not WAL-log snapshotConflictHorizon unless it's required */

1530 if (!XLogStandbyInfoActive())

1531 snapshotConflictHorizon = InvalidTransactionId;

1532

1533 /*

1534 * Construct a leaf-page-wise description of what _bt_delitems_delete()

1535 * needs to do to physically delete index tuples from the page.

1536 *

1537 * Must sort deltids array to restore leaf-page-wise order (original order

1538 * before call to tableam). This is the order that the loop expects.

1539 *

1540 * Note that deltids array might be a lot smaller now. It might even have

1541 * no entries at all (with bottom-up deletion caller), in which case there

1542 * is nothing left to do.

1543 */

1544 qsort(delstate->deltids, delstate->ndeltids, sizeof(TM_IndexDelete),

1545 _bt_delitems_cmp);

1546 if (delstate->ndeltids == 0)

1547 {

1548 Assert(delstate->bottomup);

1549 return;

1550 }

1551

1552 /* We definitely have to delete at least one index tuple (or one TID) */

1553 for (int i = 0; i < delstate->ndeltids; i++)

1554 {

1555 TM_IndexStatus *dstatus = delstate->status + delstate->deltids[i].id;

1556 OffsetNumber idxoffnum = dstatus->idxoffnum;

1557 ItemId itemid = PageGetItemId(page, idxoffnum);

1558 IndexTuple itup = (IndexTuple) PageGetItem(page, itemid);

1559 int nestedi,

1560 nitem;

1561 BTVacuumPosting vacposting;

1562

1563 Assert(OffsetNumberIsValid(idxoffnum));

1564

1565 if (idxoffnum == postingidxoffnum)

1566 {

1567 /*

1568 * This deltid entry is a TID from a posting list tuple that has

1569 * already been completely processed

1570 */

1571 Assert(BTreeTupleIsPosting(itup));

1572 Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),

1573 &delstate->deltids[i].tid) < 0);

1574 Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(itup),

1575 &delstate->deltids[i].tid) >= 0);

1576 continue;

1577 }

1578

1579 if (!BTreeTupleIsPosting(itup))

1580 {

1581 /* Plain non-pivot tuple */

1582 Assert(ItemPointerEquals(&itup->t_tid, &delstate->deltids[i].tid));

1583 if (dstatus->knowndeletable)

1584 deletable[ndeletable++] = idxoffnum;

1585 continue;

1586 }

1587

1588 /*

1589 * itup is a posting list tuple whose lowest deltids entry (which may

1590 * or may not be for the first TID from itup) is considered here now.

1591 * We should process all of the deltids entries for the posting list

1592 * together now, though (not just the lowest). Remember to skip over

1593 * later itup-related entries during later iterations of outermost

1594 * loop.

1595 */

1596 postingidxoffnum = idxoffnum; /* Remember work in outermost loop */

1597 nestedi = i; /* Initialize for first itup deltids entry */

1598 vacposting = NULL; /* Describes final action for itup */

1599 nitem = BTreeTupleGetNPosting(itup);

1600 for (int p = 0; p < nitem; p++)

1601 {

1602 ItemPointer ptid = BTreeTupleGetPostingN(itup, p);

1603 int ptidcmp = -1;

1604

1605 /*

1606 * This nested loop reuses work across ptid TIDs taken from itup.

1607 * We take advantage of the fact that both itup's TIDs and deltids

1608 * entries (within a single itup/posting list grouping) must both

1609 * be in ascending TID order.

1610 */

1611 for (; nestedi < delstate->ndeltids; nestedi++)

1612 {

1613 TM_IndexDelete *tcdeltid = &delstate->deltids[nestedi];

1614 TM_IndexStatus *tdstatus = (delstate->status + tcdeltid->id);

1615

1616 /* Stop once we get past all itup related deltids entries */

1617 Assert(tdstatus->idxoffnum >= idxoffnum);

1618 if (tdstatus->idxoffnum != idxoffnum)

1619 break;

1620

1621 /* Skip past non-deletable itup related entries up front */

1622 if (!tdstatus->knowndeletable)

1623 continue;

1624

1625 /* Entry is first partial ptid match (or an exact match)? */

1626 ptidcmp = ItemPointerCompare(&tcdeltid->tid, ptid);

1627 if (ptidcmp >= 0)

1628 {

1629 /* Greater than or equal (partial or exact) match... */

1630 break;

1631 }

1632 }

1633

1634 /* ...exact ptid match to a deletable deltids entry? */

1635 if (ptidcmp != 0)

1636 continue;

1637

1638 /* Exact match for deletable deltids entry -- ptid gets deleted */

1639 if (vacposting == NULL)

1640 {

1641 vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) +

1642 nitem * sizeof(uint16));

1643 vacposting->itup = itup;

1644 vacposting->updatedoffset = idxoffnum;

1645 vacposting->ndeletedtids = 0;

1646 }

1647 vacposting->deletetids[vacposting->ndeletedtids++] = p;

1648 }

1649

1650 /* Final decision on itup, a posting list tuple */

1651

1652 if (vacposting == NULL)

1653 {

1654 /* No TIDs to delete from itup -- do nothing */

1655 }

1656 else if (vacposting->ndeletedtids == nitem)

1657 {

1658 /* Straight delete of itup (to delete all TIDs) */

1659 deletable[ndeletable++] = idxoffnum;

1660 /* Turns out we won't need granular information */

1661 pfree(vacposting);

1662 }

1663 else

1664 {

1665 /* Delete some (but not all) TIDs from itup */

1666 Assert(vacposting->ndeletedtids > 0 &&

1667 vacposting->ndeletedtids < nitem);

1668 updatable[nupdatable++] = vacposting;

1669 }

1670 }

1671

1672 /* Physically delete tuples (or TIDs) using deletable (or updatable) */

1673 _bt_delitems_delete(rel, buf, snapshotConflictHorizon, isCatalogRel,

1674 deletable, ndeletable, updatable, nupdatable);

1675

1676 /* be tidy */

1677 for (int i = 0; i < nupdatable; i++)

1678 pfree(updatable[i]);

1679}

1680

1681/*

1682 * Check that leftsib page (the btpo_prev of target page) is not marked with

1683 * INCOMPLETE_SPLIT flag. Used during page deletion.

1684 *

1685 * Returning true indicates that page flag is set in leftsib (which is

1686 * definitely still the left sibling of target). When that happens, the

1687 * target doesn't have a downlink in parent, and the page deletion algorithm

1688 * isn't prepared to handle that. Deletion of the target page (or the whole

1689 * subtree that contains the target page) cannot take place.

1690 *

1691 * Caller should not have a lock on the target page itself, since pages on the

1692 * same level must always be locked left to right to avoid deadlocks.

1693 */

1694static bool

1695 _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)

1696{

1697 Buffer buf;

1698 Page page;

1699 BTPageOpaque opaque;

1700 bool result;

1701

1702 /* Easy case: No left sibling */

1703 if (leftsib == P_NONE)

1704 return false;

1705

1706 buf = _bt_getbuf(rel, leftsib, BT_READ);

1707 page = BufferGetPage(buf);

1708 opaque = BTPageGetOpaque(page);

1709

1710 /*

1711 * If the left sibling was concurrently split, so that its next-pointer

1712 * doesn't point to the current page anymore, the split that created

1713 * target must be completed. Caller can reasonably expect that there will

1714 * be a downlink to the target page that it can relocate using its stack.

1715 * (We don't allow splitting an incompletely split page again until the

1716 * previous split has been completed.)

1717 */

1718 result = (opaque->btpo_next == target && P_INCOMPLETE_SPLIT(opaque));

1719 _bt_relbuf(rel, buf);

1720

1721 return result;

1722}

1723

1724/*

1725 * Check that leafrightsib page (the btpo_next of target leaf page) is not

1726 * marked with ISHALFDEAD flag. Used during page deletion.

1727 *

1728 * Returning true indicates that page flag is set in leafrightsib, so page

1729 * deletion cannot go ahead. Our caller is not prepared to deal with the case

1730 * where the parent page does not have a pivot tuples whose downlink points to

1731 * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't

1732 * seem worth going to the trouble of teaching our caller to deal with it.

1733 * The situation will be resolved after VACUUM finishes the deletion of the

1734 * half-dead page (when a future VACUUM operation reaches the target page

1735 * again).

1736 *

1737 * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.

1738 * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is

1739 * okay because of the restriction on deleting pages that are the rightmost

1740 * page of their parent (i.e. that such deletions can only take place when the

1741 * entire subtree must be deleted). The leaf level check made here will apply

1742 * to a right "cousin" leaf page rather than a simple right sibling leaf page

1743 * in cases where caller actually goes on to attempt deleting pages that are

1744 * above the leaf page. The right cousin leaf page is representative of the

1745 * left edge of the subtree to the right of the to-be-deleted subtree as a

1746 * whole, which is exactly the condition that our caller cares about.

1747 * (Besides, internal pages are never marked half-dead, so it isn't even

1748 * possible to _directly_ assess if an internal page is part of some other

1749 * to-be-deleted subtree.)

1750 */

1751static bool

1752 _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)

1753{

1754 Buffer buf;

1755 Page page;

1756 BTPageOpaque opaque;

1757 bool result;

1758

1759 Assert(leafrightsib != P_NONE);

1760

1761 buf = _bt_getbuf(rel, leafrightsib, BT_READ);

1762 page = BufferGetPage(buf);

1763 opaque = BTPageGetOpaque(page);

1764

1765 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque));

1766 result = P_ISHALFDEAD(opaque);

1767 _bt_relbuf(rel, buf);

1768

1769 return result;

1770}

1771

1772/*

1773 * _bt_pagedel() -- Delete a leaf page from the b-tree, if legal to do so.

1774 *

1775 * This action unlinks the leaf page from the b-tree structure, removing all

1776 * pointers leading to it --- but not touching its own left and right links.

1777 * The page cannot be physically reclaimed right away, since other processes

1778 * may currently be trying to follow links leading to the page; they have to

1779 * be allowed to use its right-link to recover. See nbtree/README.

1780 *

1781 * On entry, the target buffer must be pinned and locked (either read or write

1782 * lock is OK). The page must be an empty leaf page, which may be half-dead

1783 * already (a half-dead page should only be passed to us when an earlier

1784 * VACUUM operation was interrupted, though). Note in particular that caller

1785 * should never pass a buffer containing an existing deleted page here. The

1786 * lock and pin on caller's buffer will be dropped before we return.

1787 *

1788 * Maintains bulk delete stats for caller, which are taken from vstate. We

1789 * need to cooperate closely with caller here so that whole VACUUM operation

1790 * reliably avoids any double counting of subsidiary-to-leafbuf pages that we

1791 * delete in passing. If such pages happen to be from a block number that is

1792 * ahead of the current scanblkno position, then caller is expected to count

1793 * them directly later on. It's simpler for us to understand caller's

1794 * requirements than it would be for caller to understand when or how a

1795 * deleted page became deleted after the fact.

1796 *

1797 * NOTE: this leaks memory. Rather than trying to clean up everything

1798 * carefully, it's better to run it in a temp context that can be reset

1799 * frequently.

1800 */

1801void

1802 _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)

1803{

1804 BlockNumber rightsib;

1805 bool rightsib_empty;

1806 Page page;

1807 BTPageOpaque opaque;

1808

1809 /*

1810 * Save original leafbuf block number from caller. Only deleted blocks

1811 * that are <= scanblkno are added to bulk delete stat's pages_deleted

1812 * count.

1813 */

1814 BlockNumber scanblkno = BufferGetBlockNumber(leafbuf);

1815

1816 /*

1817 * "stack" is a search stack leading (approximately) to the target page.

1818 * It is initially NULL, but when iterating, we keep it to avoid

1819 * duplicated search effort.

1820 *

1821 * Also, when "stack" is not NULL, we have already checked that the

1822 * current page is not the right half of an incomplete split, i.e. the

1823 * left sibling does not have its INCOMPLETE_SPLIT flag set, including

1824 * when the current target page is to the right of caller's initial page

1825 * (the scanblkno page).

1826 */

1827 BTStack stack = NULL;

1828

1829 for (;;)

1830 {

1831 page = BufferGetPage(leafbuf);

1832 opaque = BTPageGetOpaque(page);

1833

1834 /*

1835 * Internal pages are never deleted directly, only as part of deleting

1836 * the whole subtree all the way down to leaf level.

1837 *

1838 * Also check for deleted pages here. Caller never passes us a fully

1839 * deleted page. Only VACUUM can delete pages, so there can't have

1840 * been a concurrent deletion. Assume that we reached any deleted

1841 * page encountered here by following a sibling link, and that the

1842 * index is corrupt.

1843 */

1844 Assert(!P_ISDELETED(opaque));

1845 if (!P_ISLEAF(opaque) || P_ISDELETED(opaque))

1846 {

1847 /*

1848 * Pre-9.4 page deletion only marked internal pages as half-dead,

1849 * but now we only use that flag on leaf pages. The old algorithm

1850 * was never supposed to leave half-dead pages in the tree, it was

1851 * just a transient state, but it was nevertheless possible in

1852 * error scenarios. We don't know how to deal with them here. They

1853 * are harmless as far as searches are considered, but inserts

1854 * into the deleted keyspace could add out-of-order downlinks in

1855 * the upper levels. Log a notice, hopefully the admin will notice

1856 * and reindex.

1857 */

1858 if (P_ISHALFDEAD(opaque))

1859 ereport(LOG,

1860 (errcode(ERRCODE_INDEX_CORRUPTED),

1861 errmsg("index \"%s\" contains a half-dead internal page",

1862 RelationGetRelationName(rel)),

1863 errhint("This can be caused by an interrupted VACUUM in version 9.3 or older, before upgrade. Please REINDEX it.")));

1864

1865 if (P_ISDELETED(opaque))

1866 ereport(LOG,

1867 (errcode(ERRCODE_INDEX_CORRUPTED),

1868 errmsg_internal("found deleted block %u while following right link from block %u in index \"%s\"",

1869 BufferGetBlockNumber(leafbuf),

1870 scanblkno,

1871 RelationGetRelationName(rel))));

1872

1873 _bt_relbuf(rel, leafbuf);

1874 return;

1875 }

1876

1877 /*

1878 * We can never delete rightmost pages nor root pages. While at it,

1879 * check that page is empty, since it's possible that the leafbuf page

1880 * was empty a moment ago, but has since had some inserts.

1881 *

1882 * To keep the algorithm simple, we also never delete an incompletely

1883 * split page (they should be rare enough that this doesn't make any

1884 * meaningful difference to disk usage):

1885 *

1886 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the

1887 * left half of an incomplete split, but ensuring that it's not the

1888 * right half is more complicated. For that, we have to check that

1889 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using

1890 * _bt_leftsib_splitflag(). On the first iteration, we temporarily

1891 * release the lock on scanblkno/leafbuf, check the left sibling, and

1892 * construct a search stack to scanblkno. On subsequent iterations,

1893 * we know we stepped right from a page that passed these tests, so

1894 * it's OK.

1895 */

1896 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||

1897 P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||

1898 P_INCOMPLETE_SPLIT(opaque))

1899 {

1900 /* Should never fail to delete a half-dead page */

1901 Assert(!P_ISHALFDEAD(opaque));

1902

1903 _bt_relbuf(rel, leafbuf);

1904 return;

1905 }

1906

1907 /*

1908 * First, remove downlink pointing to the page (or a parent of the

1909 * page, if we are going to delete a taller subtree), and mark the

1910 * leafbuf page half-dead

1911 */

1912 if (!P_ISHALFDEAD(opaque))

1913 {

1914 /*

1915 * We need an approximate pointer to the page's parent page. We

1916 * use a variant of the standard search mechanism to search for

1917 * the page's high key; this will give us a link to either the

1918 * current parent or someplace to its left (if there are multiple

1919 * equal high keys, which is possible with !heapkeyspace indexes).

1920 *

1921 * Also check if this is the right-half of an incomplete split

1922 * (see comment above).

1923 */

1924 if (!stack)

1925 {

1926 BTScanInsert itup_key;

1927 ItemId itemid;

1928 IndexTuple targetkey;

1929 BlockNumber leftsib,

1930 leafblkno;

1931 Buffer sleafbuf;

1932

1933 itemid = PageGetItemId(page, P_HIKEY);

1934 targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));

1935

1936 leftsib = opaque->btpo_prev;

1937 leafblkno = BufferGetBlockNumber(leafbuf);

1938

1939 /*

1940 * To avoid deadlocks, we'd better drop the leaf page lock

1941 * before going further.

1942 */

1943 _bt_unlockbuf(rel, leafbuf);

1944

1945 /*

1946 * Check that the left sibling of leafbuf (if any) is not

1947 * marked with INCOMPLETE_SPLIT flag before proceeding

1948 */

1949 Assert(leafblkno == scanblkno);

1950 if (_bt_leftsib_splitflag(rel, leftsib, leafblkno))

1951 {

1952 ReleaseBuffer(leafbuf);

1953 return;

1954 }

1955

1956 /*

1957 * We need an insertion scan key, so build one.

1958 *

1959 * _bt_search searches for the leaf page that contains any

1960 * matching non-pivot tuples, but we need it to "search" for

1961 * the high key pivot from the page that we're set to delete.

1962 * Compensate for the mismatch by having _bt_search locate the

1963 * last position < equal-to-untruncated-prefix non-pivots.

1964 */

1965 itup_key = _bt_mkscankey(rel, targetkey);

1966

1967 /* Set up a BTLessStrategyNumber-like insertion scan key */

1968 itup_key->nextkey = false;

1969 itup_key->backward = true;

1970 stack = _bt_search(rel, NULL, itup_key, &sleafbuf, BT_READ);

1971 /* won't need a second lock or pin on leafbuf */

1972 _bt_relbuf(rel, sleafbuf);

1973

1974 /*

1975 * Re-lock the leaf page, and start over to use our stack

1976 * within _bt_mark_page_halfdead. We must do it that way

1977 * because it's possible that leafbuf can no longer be

1978 * deleted. We need to recheck.

1979 *

1980 * Note: We can't simply hold on to the sleafbuf lock instead,

1981 * because it's barely possible that sleafbuf is not the same

1982 * page as leafbuf. This happens when leafbuf split after our

1983 * original lock was dropped, but before _bt_search finished

1984 * its descent. We rely on the assumption that we'll find

1985 * leafbuf isn't safe to delete anymore in this scenario.

1986 * (Page deletion can cope with the stack being to the left of

1987 * leafbuf, but not to the right of leafbuf.)

1988 */

1989 _bt_lockbuf(rel, leafbuf, BT_WRITE);

1990 continue;

1991 }

1992

1993 /*

1994 * See if it's safe to delete the leaf page, and determine how

1995 * many parent/internal pages above the leaf level will be

1996 * deleted. If it's safe then _bt_mark_page_halfdead will also

1997 * perform the first phase of deletion, which includes marking the

1998 * leafbuf page half-dead.

1999 */

2000 Assert(P_ISLEAF(opaque) && !P_IGNORE(opaque));

2001 if (!_bt_mark_page_halfdead(rel, vstate->info->heaprel, leafbuf,

2002 stack))

2003 {

2004 _bt_relbuf(rel, leafbuf);

2005 return;

2006 }

2007 }

2008

2009 /*

2010 * Then unlink it from its siblings. Each call to

2011 * _bt_unlink_halfdead_page unlinks the topmost page from the subtree,

2012 * making it shallower. Iterate until the leafbuf page is deleted.

2013 */

2014 rightsib_empty = false;

2015 Assert(P_ISLEAF(opaque) && P_ISHALFDEAD(opaque));

2016 while (P_ISHALFDEAD(opaque))

2017 {

2018 /* Check for interrupts in _bt_unlink_halfdead_page */

2019 if (!_bt_unlink_halfdead_page(rel, leafbuf, scanblkno,

2020 &rightsib_empty, vstate))

2021 {

2022 /*

2023 * _bt_unlink_halfdead_page should never fail, since we

2024 * established that deletion is generally safe in

2025 * _bt_mark_page_halfdead -- index must be corrupt.

2026 *

2027 * Note that _bt_unlink_halfdead_page already released the

2028 * lock and pin on leafbuf for us.

2029 */

2030 Assert(false);

2031 return;

2032 }

2033 }

2034

2035 Assert(P_ISLEAF(opaque) && P_ISDELETED(opaque));

2036

2037 rightsib = opaque->btpo_next;

2038

2039 _bt_relbuf(rel, leafbuf);

2040

2041 /*

2042 * Check here, as calling loops will have locks held, preventing

2043 * interrupts from being processed.

2044 */

2045 CHECK_FOR_INTERRUPTS();

2046

2047 /*

2048 * The page has now been deleted. If its right sibling is completely

2049 * empty, it's possible that the reason we haven't deleted it earlier

2050 * is that it was the rightmost child of the parent. Now that we

2051 * removed the downlink for this page, the right sibling might now be

2052 * the only child of the parent, and could be removed. It would be

2053 * picked up by the next vacuum anyway, but might as well try to

2054 * remove it now, so loop back to process the right sibling.

2055 *

2056 * Note: This relies on the assumption that _bt_getstackbuf() will be

2057 * able to reuse our original descent stack with a different child

2058 * block (provided that the child block is to the right of the

2059 * original leaf page reached by _bt_search()). It will even update

2060 * the descent stack each time we loop around, avoiding repeated work.

2061 */

2062 if (!rightsib_empty)

2063 break;

2064

2065 leafbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

2066 }

2067}

2068

2069/*

2070 * First stage of page deletion.

2071 *

2072 * Establish the height of the to-be-deleted subtree with leafbuf at its

2073 * lowest level, remove the downlink to the subtree, and mark leafbuf

2074 * half-dead. The final to-be-deleted subtree is usually just leafbuf itself,

2075 * but may include additional internal pages (at most one per level of the

2076 * tree below the root).

2077 *

2078 * Caller must pass a valid heaprel, since it's just about possible that our

2079 * call to _bt_lock_subtree_parent will need to allocate a new index page to

2080 * complete a page split. Every call to _bt_allocbuf needs to pass a heaprel.

2081 *

2082 * Returns 'false' if leafbuf is unsafe to delete, usually because leafbuf is

2083 * the rightmost child of its parent (and parent has more than one downlink).

2084 * Returns 'true' when the first stage of page deletion completed

2085 * successfully.

2086 */

2087static bool

2088 _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf,

2089 BTStack stack)

2090{

2091 BlockNumber leafblkno;

2092 BlockNumber leafrightsib;

2093 BlockNumber topparent;

2094 BlockNumber topparentrightsib;

2095 ItemId itemid;

2096 Page page;

2097 BTPageOpaque opaque;

2098 Buffer subtreeparent;

2099 OffsetNumber poffset;

2100 OffsetNumber nextoffset;

2101 IndexTuple itup;

2102 IndexTupleData trunctuple;

2103

2104 page = BufferGetPage(leafbuf);

2105 opaque = BTPageGetOpaque(page);

2106

2107 Assert(!P_RIGHTMOST(opaque) && !P_ISROOT(opaque) &&

2108 P_ISLEAF(opaque) && !P_IGNORE(opaque) &&

2109 P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

2110 Assert(heaprel != NULL);

2111

2112 /*

2113 * Save info about the leaf page.

2114 */

2115 leafblkno = BufferGetBlockNumber(leafbuf);

2116 leafrightsib = opaque->btpo_next;

2117

2118 /*

2119 * Before attempting to lock the parent page, check that the right sibling

2120 * is not in half-dead state. A half-dead right sibling would have no

2121 * downlink in the parent, which would be highly confusing later when we

2122 * delete the downlink. It would fail the "right sibling of target page

2123 * is also the next child in parent page" cross-check below.

2124 */

2125 if (_bt_rightsib_halfdeadflag(rel, leafrightsib))

2126 {

2127 elog(DEBUG1, "could not delete page %u because its right sibling %u is half-dead",

2128 leafblkno, leafrightsib);

2129 return false;

2130 }

2131

2132 /*

2133 * We cannot delete a page that is the rightmost child of its immediate

2134 * parent, unless it is the only child --- in which case the parent has to

2135 * be deleted too, and the same condition applies recursively to it. We

2136 * have to check this condition all the way up before trying to delete,

2137 * and lock the parent of the root of the to-be-deleted subtree (the

2138 * "subtree parent"). _bt_lock_subtree_parent() locks the subtree parent

2139 * for us. We remove the downlink to the "top parent" page (subtree root

2140 * page) from the subtree parent page below.

2141 *

2142 * Initialize topparent to be leafbuf page now. The final to-be-deleted

2143 * subtree is often a degenerate one page subtree consisting only of the

2144 * leafbuf page. When that happens, the leafbuf page is the final subtree

2145 * root page/top parent page.

2146 */

2147 topparent = leafblkno;

2148 topparentrightsib = leafrightsib;

2149 if (!_bt_lock_subtree_parent(rel, heaprel, leafblkno, stack,

2150 &subtreeparent, &poffset,

2151 &topparent, &topparentrightsib))

2152 return false;

2153

2154 page = BufferGetPage(subtreeparent);

2155 opaque = BTPageGetOpaque(page);

2156

2157#ifdef USE_ASSERT_CHECKING

2158

2159 /*

2160 * This is just an assertion because _bt_lock_subtree_parent should have

2161 * guaranteed tuple has the expected contents

2162 */

2163 itemid = PageGetItemId(page, poffset);

2164 itup = (IndexTuple) PageGetItem(page, itemid);

2165 Assert(BTreeTupleGetDownLink(itup) == topparent);

2166#endif

2167

2168 nextoffset = OffsetNumberNext(poffset);

2169 itemid = PageGetItemId(page, nextoffset);

2170 itup = (IndexTuple) PageGetItem(page, itemid);

2171

2172 /*

2173 * Check that the parent-page index items we're about to delete/overwrite

2174 * in subtree parent page contain what we expect. This can fail if the

2175 * index has become corrupt for some reason. When that happens we back

2176 * out of deletion of the leafbuf subtree. (This is just like the case

2177 * where _bt_lock_subtree_parent() cannot "re-find" leafbuf's downlink.)

2178 */

2179 if (BTreeTupleGetDownLink(itup) != topparentrightsib)

2180 {

2181 ereport(LOG,

2182 (errcode(ERRCODE_INDEX_CORRUPTED),

2183 errmsg_internal("right sibling %u of block %u is not next child %u of block %u in index \"%s\"",

2184 topparentrightsib, topparent,

2185 BTreeTupleGetDownLink(itup),

2186 BufferGetBlockNumber(subtreeparent),

2187 RelationGetRelationName(rel))));

2188

2189 _bt_relbuf(rel, subtreeparent);

2190 Assert(false);

2191 return false;

2192 }

2193

2194 /*

2195 * Any insert which would have gone on the leaf block will now go to its

2196 * right sibling. In other words, the key space moves right.

2197 */

2198 PredicateLockPageCombine(rel, leafblkno, leafrightsib);

2199

2200 /* No ereport(ERROR) until changes are logged */

2201 START_CRIT_SECTION();

2202

2203 /*

2204 * Update parent of subtree. We want to delete the downlink to the top

2205 * parent page/root of the subtree, and the *following* key. Easiest way

2206 * is to copy the right sibling's downlink over the downlink that points

2207 * to top parent page, and then delete the right sibling's original pivot

2208 * tuple.

2209 *

2210 * Lanin and Shasha make the key space move left when deleting a page,

2211 * whereas the key space moves right here. That's why we cannot simply

2212 * delete the pivot tuple with the downlink to the top parent page. See

2213 * nbtree/README.

2214 */

2215 page = BufferGetPage(subtreeparent);

2216 opaque = BTPageGetOpaque(page);

2217

2218 itemid = PageGetItemId(page, poffset);

2219 itup = (IndexTuple) PageGetItem(page, itemid);

2220 BTreeTupleSetDownLink(itup, topparentrightsib);

2221

2222 nextoffset = OffsetNumberNext(poffset);

2223 PageIndexTupleDelete(page, nextoffset);

2224

2225 /*

2226 * Mark the leaf page as half-dead, and stamp it with a link to the top

2227 * parent page. When the leaf page is also the top parent page, the link

2228 * is set to InvalidBlockNumber.

2229 */

2230 page = BufferGetPage(leafbuf);

2231 opaque = BTPageGetOpaque(page);

2232 opaque->btpo_flags |= BTP_HALF_DEAD;

2233

2234 Assert(PageGetMaxOffsetNumber(page) == P_HIKEY);

2235 MemSet(&trunctuple, 0, sizeof(IndexTupleData));

2236 trunctuple.t_info = sizeof(IndexTupleData);

2237 if (topparent != leafblkno)

2238 BTreeTupleSetTopParent(&trunctuple, topparent);

2239 else

2240 BTreeTupleSetTopParent(&trunctuple, InvalidBlockNumber);

2241

2242 if (!PageIndexTupleOverwrite(page, P_HIKEY, (Item) &trunctuple,

2243 IndexTupleSize(&trunctuple)))

2244 elog(ERROR, "could not overwrite high key in half-dead page");

2245

2246 /* Must mark buffers dirty before XLogInsert */

2247 MarkBufferDirty(subtreeparent);

2248 MarkBufferDirty(leafbuf);

2249

2250 /* XLOG stuff */

2251 if (RelationNeedsWAL(rel))

2252 {

2253 xl_btree_mark_page_halfdead xlrec;

2254 XLogRecPtr recptr;

2255

2256 xlrec.poffset = poffset;

2257 xlrec.leafblk = leafblkno;

2258 if (topparent != leafblkno)

2259 xlrec.topparent = topparent;

2260 else

2261 xlrec.topparent = InvalidBlockNumber;

2262

2263 XLogBeginInsert();

2264 XLogRegisterBuffer(0, leafbuf, REGBUF_WILL_INIT);

2265 XLogRegisterBuffer(1, subtreeparent, REGBUF_STANDARD);

2266

2267 page = BufferGetPage(leafbuf);

2268 opaque = BTPageGetOpaque(page);

2269 xlrec.leftblk = opaque->btpo_prev;

2270 xlrec.rightblk = opaque->btpo_next;

2271

2272 XLogRegisterData(&xlrec, SizeOfBtreeMarkPageHalfDead);

2273

2274 recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_MARK_PAGE_HALFDEAD);

2275

2276 page = BufferGetPage(subtreeparent);

2277 PageSetLSN(page, recptr);

2278 page = BufferGetPage(leafbuf);

2279 PageSetLSN(page, recptr);

2280 }

2281

2282 END_CRIT_SECTION();

2283

2284 _bt_relbuf(rel, subtreeparent);

2285 return true;

2286}

2287

2288/*

2289 * Second stage of page deletion.

2290 *

2291 * Unlinks a single page (in the subtree undergoing deletion) from its

2292 * siblings. Also marks the page deleted.

2293 *

2294 * To get rid of the whole subtree, including the leaf page itself, call here

2295 * until the leaf page is deleted. The original "top parent" established in

2296 * the first stage of deletion is deleted in the first call here, while the

2297 * leaf page is deleted in the last call here. Note that the leaf page itself

2298 * is often the initial top parent page.

2299 *

2300 * Returns 'false' if the page could not be unlinked (shouldn't happen). If

2301 * the right sibling of the current target page is empty, *rightsib_empty is

2302 * set to true, allowing caller to delete the target's right sibling page in

2303 * passing. Note that *rightsib_empty is only actually used by caller when

2304 * target page is leafbuf, following last call here for leafbuf/the subtree

2305 * containing leafbuf. (We always set *rightsib_empty for caller, just to be

2306 * consistent.)

2307 *

2308 * Must hold pin and lock on leafbuf at entry (read or write doesn't matter).

2309 * On success exit, we'll be holding pin and write lock. On failure exit,

2310 * we'll release both pin and lock before returning (we define it that way

2311 * to avoid having to reacquire a lock we already released).

2312 */

2313static bool

2314 _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,

2315 bool *rightsib_empty, BTVacState *vstate)

2316{

2317 BlockNumber leafblkno = BufferGetBlockNumber(leafbuf);

2318 IndexBulkDeleteResult *stats = vstate->stats;

2319 BlockNumber leafleftsib;

2320 BlockNumber leafrightsib;

2321 BlockNumber target;

2322 BlockNumber leftsib;

2323 BlockNumber rightsib;

2324 Buffer lbuf = InvalidBuffer;

2325 Buffer buf;

2326 Buffer rbuf;

2327 Buffer metabuf = InvalidBuffer;

2328 Page metapg = NULL;

2329 BTMetaPageData *metad = NULL;

2330 ItemId itemid;

2331 Page page;

2332 BTPageOpaque opaque;

2333 FullTransactionId safexid;

2334 bool rightsib_is_rightmost;

2335 uint32 targetlevel;

2336 IndexTuple leafhikey;

2337 BlockNumber leaftopparent;

2338

2339 page = BufferGetPage(leafbuf);

2340 opaque = BTPageGetOpaque(page);

2341

2342 Assert(P_ISLEAF(opaque) && !P_ISDELETED(opaque) && P_ISHALFDEAD(opaque));

2343

2344 /*

2345 * Remember some information about the leaf page.

2346 */

2347 itemid = PageGetItemId(page, P_HIKEY);

2348 leafhikey = (IndexTuple) PageGetItem(page, itemid);

2349 target = BTreeTupleGetTopParent(leafhikey);

2350 leafleftsib = opaque->btpo_prev;

2351 leafrightsib = opaque->btpo_next;

2352

2353 _bt_unlockbuf(rel, leafbuf);

2354

2355 /*

2356 * Check here, as calling loops will have locks held, preventing

2357 * interrupts from being processed.

2358 */

2359 CHECK_FOR_INTERRUPTS();

2360

2361 /* Unlink the current top parent of the subtree */

2362 if (!BlockNumberIsValid(target))

2363 {

2364 /* Target is leaf page (or leaf page is top parent, if you prefer) */

2365 target = leafblkno;

2366

2367 buf = leafbuf;

2368 leftsib = leafleftsib;

2369 targetlevel = 0;

2370 }

2371 else

2372 {

2373 /* Target is the internal page taken from leaf's top parent link */

2374 Assert(target != leafblkno);

2375

2376 /* Fetch the block number of the target's left sibling */

2377 buf = _bt_getbuf(rel, target, BT_READ);

2378 page = BufferGetPage(buf);

2379 opaque = BTPageGetOpaque(page);

2380 leftsib = opaque->btpo_prev;

2381 targetlevel = opaque->btpo_level;

2382 Assert(targetlevel > 0);

2383

2384 /*

2385 * To avoid deadlocks, we'd better drop the target page lock before

2386 * going further.

2387 */

2388 _bt_unlockbuf(rel, buf);

2389 }

2390

2391 /*

2392 * We have to lock the pages we need to modify in the standard order:

2393 * moving right, then up. Else we will deadlock against other writers.

2394 *

2395 * So, first lock the leaf page, if it's not the target. Then find and

2396 * write-lock the current left sibling of the target page. The sibling

2397 * that was current a moment ago could have split, so we may have to move

2398 * right.

2399 */

2400 if (target != leafblkno)

2401 _bt_lockbuf(rel, leafbuf, BT_WRITE);

2402 if (leftsib != P_NONE)

2403 {

2404 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);

2405 page = BufferGetPage(lbuf);

2406 opaque = BTPageGetOpaque(page);

2407 while (P_ISDELETED(opaque) || opaque->btpo_next != target)

2408 {

2409 bool leftsibvalid = true;

2410

2411 /*

2412 * Before we follow the link from the page that was the left

2413 * sibling mere moments ago, validate its right link. This

2414 * reduces the opportunities for loop to fail to ever make any

2415 * progress in the presence of index corruption.

2416 *

2417 * Note: we rely on the assumption that there can only be one

2418 * vacuum process running at a time (against the same index).

2419 */

2420 if (P_RIGHTMOST(opaque) || P_ISDELETED(opaque) ||

2421 leftsib == opaque->btpo_next)

2422 leftsibvalid = false;

2423

2424 leftsib = opaque->btpo_next;

2425 _bt_relbuf(rel, lbuf);

2426

2427 if (!leftsibvalid)

2428 {

2429 /*

2430 * This is known to fail in the field; sibling link corruption

2431 * is relatively common. Press on with vacuuming rather than

2432 * just throwing an ERROR.

2433 */

2434 ereport(LOG,

2435 (errcode(ERRCODE_INDEX_CORRUPTED),

2436 errmsg_internal("valid left sibling for deletion target could not be located: "

2437 "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",

2438 leftsib, target, leafblkno, scanblkno,

2439 targetlevel, RelationGetRelationName(rel))));

2440

2441 /* Must release all pins and locks on failure exit */

2442 ReleaseBuffer(buf);

2443 if (target != leafblkno)

2444 _bt_relbuf(rel, leafbuf);

2445

2446 return false;

2447 }

2448

2449 CHECK_FOR_INTERRUPTS();

2450

2451 /* step right one page */

2452 lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);

2453 page = BufferGetPage(lbuf);

2454 opaque = BTPageGetOpaque(page);

2455 }

2456 }

2457 else

2458 lbuf = InvalidBuffer;

2459

2460 /* Next write-lock the target page itself */

2461 _bt_lockbuf(rel, buf, BT_WRITE);

2462 page = BufferGetPage(buf);

2463 opaque = BTPageGetOpaque(page);

2464

2465 /*

2466 * Check page is still empty etc, else abandon deletion. This is just for

2467 * paranoia's sake; a half-dead page cannot resurrect because there can be

2468 * only one vacuum process running at a time.

2469 */

2470 if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque))

2471 elog(ERROR, "target page changed status unexpectedly in block %u of index \"%s\"",

2472 target, RelationGetRelationName(rel));

2473

2474 if (opaque->btpo_prev != leftsib)

2475 ereport(ERROR,

2476 (errcode(ERRCODE_INDEX_CORRUPTED),

2477 errmsg_internal("target page left link unexpectedly changed from %u to %u in block %u of index \"%s\"",

2478 leftsib, opaque->btpo_prev, target,

2479 RelationGetRelationName(rel))));

2480

2481 if (target == leafblkno)

2482 {

2483 if (P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||

2484 !P_ISLEAF(opaque) || !P_ISHALFDEAD(opaque))

2485 elog(ERROR, "target leaf page changed status unexpectedly in block %u of index \"%s\"",

2486 target, RelationGetRelationName(rel));

2487

2488 /* Leaf page is also target page: don't set leaftopparent */

2489 leaftopparent = InvalidBlockNumber;

2490 }

2491 else

2492 {

2493 IndexTuple finaldataitem;

2494

2495 if (P_FIRSTDATAKEY(opaque) != PageGetMaxOffsetNumber(page) ||

2496 P_ISLEAF(opaque))

2497 elog(ERROR, "target internal page on level %u changed status unexpectedly in block %u of index \"%s\"",

2498 targetlevel, target, RelationGetRelationName(rel));

2499

2500 /* Target is internal: set leaftopparent for next call here... */

2501 itemid = PageGetItemId(page, P_FIRSTDATAKEY(opaque));

2502 finaldataitem = (IndexTuple) PageGetItem(page, itemid);

2503 leaftopparent = BTreeTupleGetDownLink(finaldataitem);

2504 /* ...except when it would be a redundant pointer-to-self */

2505 if (leaftopparent == leafblkno)

2506 leaftopparent = InvalidBlockNumber;

2507 }

2508

2509 /* No leaftopparent for level 0 (leaf page) or level 1 target */

2510 Assert(!BlockNumberIsValid(leaftopparent) || targetlevel > 1);

2511

2512 /*

2513 * And next write-lock the (current) right sibling.

2514 */

2515 rightsib = opaque->btpo_next;

2516 rbuf = _bt_getbuf(rel, rightsib, BT_WRITE);

2517 page = BufferGetPage(rbuf);

2518 opaque = BTPageGetOpaque(page);

2519

2520 /*

2521 * Validate target's right sibling page. Its left link must point back to

2522 * the target page.

2523 */

2524 if (opaque->btpo_prev != target)

2525 {

2526 /*

2527 * This is known to fail in the field; sibling link corruption is

2528 * relatively common. Press on with vacuuming rather than just

2529 * throwing an ERROR (same approach used for left-sibling's-right-link

2530 * validation check a moment ago).

2531 */

2532 ereport(LOG,

2533 (errcode(ERRCODE_INDEX_CORRUPTED),

2534 errmsg_internal("right sibling's left-link doesn't match: "

2535 "right sibling %u of target %u with leafblkno %u "

2536 "and scanblkno %u spuriously links to non-target %u "

2537 "on level %u of index \"%s\"",

2538 rightsib, target, leafblkno,

2539 scanblkno, opaque->btpo_prev,

2540 targetlevel, RelationGetRelationName(rel))));

2541

2542 /* Must release all pins and locks on failure exit */

2543 if (BufferIsValid(lbuf))

2544 _bt_relbuf(rel, lbuf);

2545 _bt_relbuf(rel, rbuf);

2546 _bt_relbuf(rel, buf);

2547 if (target != leafblkno)

2548 _bt_relbuf(rel, leafbuf);

2549

2550 return false;

2551 }

2552

2553 rightsib_is_rightmost = P_RIGHTMOST(opaque);

2554 *rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));

2555

2556 /*

2557 * If we are deleting the next-to-last page on the target's level, then

2558 * the rightsib is a candidate to become the new fast root. (In theory, it

2559 * might be possible to push the fast root even further down, but the odds

2560 * of doing so are slim, and the locking considerations daunting.)

2561 *

2562 * We can safely acquire a lock on the metapage here --- see comments for

2563 * _bt_newlevel().

2564 */

2565 if (leftsib == P_NONE && rightsib_is_rightmost)

2566 {

2567 page = BufferGetPage(rbuf);

2568 opaque = BTPageGetOpaque(page);

2569 if (P_RIGHTMOST(opaque))

2570 {

2571 /* rightsib will be the only one left on the level */

2572 metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);

2573 metapg = BufferGetPage(metabuf);

2574 metad = BTPageGetMeta(metapg);

2575

2576 /*

2577 * The expected case here is btm_fastlevel == targetlevel+1; if

2578 * the fastlevel is <= targetlevel, something is wrong, and we

2579 * choose to overwrite it to fix it.

2580 */

2581 if (metad->btm_fastlevel > targetlevel + 1)

2582 {

2583 /* no update wanted */

2584 _bt_relbuf(rel, metabuf);

2585 metabuf = InvalidBuffer;

2586 }

2587 }

2588 }

2589

2590 /*

2591 * Here we begin doing the deletion.

2592 */

2593

2594 /* No ereport(ERROR) until changes are logged */

2595 START_CRIT_SECTION();

2596

2597 /*

2598 * Update siblings' side-links. Note the target page's side-links will

2599 * continue to point to the siblings. Asserts here are just rechecking

2600 * things we already verified above.

2601 */

2602 if (BufferIsValid(lbuf))

2603 {

2604 page = BufferGetPage(lbuf);

2605 opaque = BTPageGetOpaque(page);

2606 Assert(opaque->btpo_next == target);

2607 opaque->btpo_next = rightsib;

2608 }

2609 page = BufferGetPage(rbuf);

2610 opaque = BTPageGetOpaque(page);

2611 Assert(opaque->btpo_prev == target);

2612 opaque->btpo_prev = leftsib;

2613

2614 /*

2615 * If we deleted a parent of the targeted leaf page, instead of the leaf

2616 * itself, update the leaf to point to the next remaining child in the

2617 * subtree.

2618 *

2619 * Note: We rely on the fact that a buffer pin on the leaf page has been

2620 * held since leafhikey was initialized. This is safe, though only

2621 * because the page was already half-dead at that point. The leaf page

2622 * cannot have been modified by any other backend during the period when

2623 * no lock was held.

2624 */

2625 if (target != leafblkno)

2626 BTreeTupleSetTopParent(leafhikey, leaftopparent);

2627

2628 /*

2629 * Mark the page itself deleted. It can be recycled when all current

2630 * transactions are gone. Storing GetTopTransactionId() would work, but

2631 * we're in VACUUM and would not otherwise have an XID. Having already

2632 * updated links to the target, ReadNextFullTransactionId() suffices as an

2633 * upper bound. Any scan having retained a now-stale link is advertising

2634 * in its PGPROC an xmin less than or equal to the value we read here. It

2635 * will continue to do so, holding back the xmin horizon, for the duration

2636 * of that scan.

2637 */

2638 page = BufferGetPage(buf);

2639 opaque = BTPageGetOpaque(page);

2640 Assert(P_ISHALFDEAD(opaque) || !P_ISLEAF(opaque));

2641

2642 /*

2643 * Store upper bound XID that's used to determine when deleted page is no

2644 * longer needed as a tombstone

2645 */

2646 safexid = ReadNextFullTransactionId();

2647 BTPageSetDeleted(page, safexid);

2648 opaque->btpo_cycleid = 0;

2649

2650 /* And update the metapage, if needed */

2651 if (BufferIsValid(metabuf))

2652 {

2653 /* upgrade metapage if needed */

2654 if (metad->btm_version < BTREE_NOVAC_VERSION)

2655 _bt_upgrademetapage(metapg);

2656 metad->btm_fastroot = rightsib;

2657 metad->btm_fastlevel = targetlevel;

2658 MarkBufferDirty(metabuf);

2659 }

2660

2661 /* Must mark buffers dirty before XLogInsert */

2662 MarkBufferDirty(rbuf);

2663 MarkBufferDirty(buf);

2664 if (BufferIsValid(lbuf))

2665 MarkBufferDirty(lbuf);

2666 if (target != leafblkno)

2667 MarkBufferDirty(leafbuf);

2668

2669 /* XLOG stuff */

2670 if (RelationNeedsWAL(rel))

2671 {

2672 xl_btree_unlink_page xlrec;

2673 xl_btree_metadata xlmeta;

2674 uint8 xlinfo;

2675 XLogRecPtr recptr;

2676

2677 XLogBeginInsert();

2678

2679 XLogRegisterBuffer(0, buf, REGBUF_WILL_INIT);

2680 if (BufferIsValid(lbuf))

2681 XLogRegisterBuffer(1, lbuf, REGBUF_STANDARD);

2682 XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD);

2683 if (target != leafblkno)

2684 XLogRegisterBuffer(3, leafbuf, REGBUF_WILL_INIT);

2685

2686 /* information stored on the target/to-be-unlinked block */

2687 xlrec.leftsib = leftsib;

2688 xlrec.rightsib = rightsib;

2689 xlrec.level = targetlevel;

2690 xlrec.safexid = safexid;

2691

2692 /* information needed to recreate the leaf block (if not the target) */

2693 xlrec.leafleftsib = leafleftsib;

2694 xlrec.leafrightsib = leafrightsib;

2695 xlrec.leaftopparent = leaftopparent;

2696

2697 XLogRegisterData(&xlrec, SizeOfBtreeUnlinkPage);

2698

2699 if (BufferIsValid(metabuf))

2700 {

2701 XLogRegisterBuffer(4, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);

2702

2703 Assert(metad->btm_version >= BTREE_NOVAC_VERSION);

2704 xlmeta.version = metad->btm_version;

2705 xlmeta.root = metad->btm_root;

2706 xlmeta.level = metad->btm_level;

2707 xlmeta.fastroot = metad->btm_fastroot;

2708 xlmeta.fastlevel = metad->btm_fastlevel;

2709 xlmeta.last_cleanup_num_delpages = metad->btm_last_cleanup_num_delpages;

2710 xlmeta.allequalimage = metad->btm_allequalimage;

2711

2712 XLogRegisterBufData(4, &xlmeta, sizeof(xl_btree_metadata));

2713 xlinfo = XLOG_BTREE_UNLINK_PAGE_META;

2714 }

2715 else

2716 xlinfo = XLOG_BTREE_UNLINK_PAGE;

2717

2718 recptr = XLogInsert(RM_BTREE_ID, xlinfo);

2719

2720 if (BufferIsValid(metabuf))

2721 {

2722 PageSetLSN(metapg, recptr);

2723 }

2724 page = BufferGetPage(rbuf);

2725 PageSetLSN(page, recptr);

2726 page = BufferGetPage(buf);

2727 PageSetLSN(page, recptr);

2728 if (BufferIsValid(lbuf))

2729 {

2730 page = BufferGetPage(lbuf);

2731 PageSetLSN(page, recptr);

2732 }

2733 if (target != leafblkno)

2734 {

2735 page = BufferGetPage(leafbuf);

2736 PageSetLSN(page, recptr);

2737 }

2738 }

2739

2740 END_CRIT_SECTION();

2741

2742 /* release metapage */

2743 if (BufferIsValid(metabuf))

2744 _bt_relbuf(rel, metabuf);

2745

2746 /* release siblings */

2747 if (BufferIsValid(lbuf))

2748 _bt_relbuf(rel, lbuf);

2749 _bt_relbuf(rel, rbuf);

2750

2751 /* If the target is not leafbuf, we're done with it now -- release it */

2752 if (target != leafblkno)

2753 _bt_relbuf(rel, buf);

2754

2755 /*

2756 * Maintain pages_newly_deleted, which is simply the number of pages

2757 * deleted by the ongoing VACUUM operation.

2758 *

2759 * Maintain pages_deleted in a way that takes into account how

2760 * btvacuumpage() will count deleted pages that have yet to become

2761 * scanblkno -- only count page when it's not going to get that treatment

2762 * later on.

2763 */

2764 stats->pages_newly_deleted++;

2765 if (target <= scanblkno)

2766 stats->pages_deleted++;

2767

2768 /*

2769 * Remember information about the target page (now a newly deleted page)

2770 * in dedicated vstate space for later. The page will be considered as a

2771 * candidate to place in the FSM at the end of the current btvacuumscan()

2772 * call.

2773 */

2774 _bt_pendingfsm_add(vstate, target, safexid);

2775

2776 /* Success - hold on to lock on leafbuf (might also have been target) */

2777 return true;

2778}

2779

2780/*

2781 * Establish how tall the to-be-deleted subtree will be during the first stage

2782 * of page deletion.

2783 *

2784 * Caller's child argument is the block number of the page caller wants to

2785 * delete (this is leafbuf's block number, except when we're called

2786 * recursively). stack is a search stack leading to it. Note that we will

2787 * update the stack entry(s) to reflect current downlink positions --- this is

2788 * similar to the corresponding point in page split handling.

2789 *

2790 * If "first stage" caller cannot go ahead with deleting _any_ pages, returns

2791 * false. Returns true on success, in which case caller can use certain

2792 * details established here to perform the first stage of deletion. This

2793 * function is the last point at which page deletion may be deemed unsafe

2794 * (barring index corruption, or unexpected concurrent page deletions).

2795 *

2796 * We write lock the parent of the root of the to-be-deleted subtree for

2797 * caller on success (i.e. we leave our lock on the *subtreeparent buffer for

2798 * caller). Caller will have to remove a downlink from *subtreeparent. We

2799 * also set a *subtreeparent offset number in *poffset, to indicate the

2800 * location of the pivot tuple that contains the relevant downlink.

2801 *

2802 * The root of the to-be-deleted subtree is called the "top parent". Note

2803 * that the leafbuf page is often the final "top parent" page (you can think

2804 * of the leafbuf page as a degenerate single page subtree when that happens).

2805 * Caller should initialize *topparent to the target leafbuf page block number

2806 * (while *topparentrightsib should be set to leafbuf's right sibling block

2807 * number). We will update *topparent (and *topparentrightsib) for caller

2808 * here, though only when it turns out that caller will delete at least one

2809 * internal page (i.e. only when caller needs to store a valid link to the top

2810 * parent block in the leafbuf page using BTreeTupleSetTopParent()).

2811 */

2812static bool

2813 _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child,

2814 BTStack stack, Buffer *subtreeparent,

2815 OffsetNumber *poffset, BlockNumber *topparent,

2816 BlockNumber *topparentrightsib)

2817{

2818 BlockNumber parent,

2819 leftsibparent;

2820 OffsetNumber parentoffset,

2821 maxoff;

2822 Buffer pbuf;

2823 Page page;

2824 BTPageOpaque opaque;

2825

2826 /*

2827 * Locate the pivot tuple whose downlink points to "child". Write lock

2828 * the parent page itself.

2829 */

2830 pbuf = _bt_getstackbuf(rel, heaprel, stack, child);

2831 if (pbuf == InvalidBuffer)

2832 {

2833 /*

2834 * Failed to "re-find" a pivot tuple whose downlink matched our child

2835 * block number on the parent level -- the index must be corrupt.

2836 * Don't even try to delete the leafbuf subtree. Just report the

2837 * issue and press on with vacuuming the index.

2838 *

2839 * Note: _bt_getstackbuf() recovers from concurrent page splits that

2840 * take place on the parent level. Its approach is a near-exhaustive

2841 * linear search. This also gives it a surprisingly good chance of

2842 * recovering in the event of a buggy or inconsistent opclass. But we

2843 * don't rely on that here.

2844 */

2845 ereport(LOG,

2846 (errcode(ERRCODE_INDEX_CORRUPTED),

2847 errmsg_internal("failed to re-find parent key in index \"%s\" for deletion target page %u",

2848 RelationGetRelationName(rel), child)));

2849 Assert(false);

2850 return false;

2851 }

2852

2853 parent = stack->bts_blkno;

2854 parentoffset = stack->bts_offset;

2855

2856 page = BufferGetPage(pbuf);

2857 opaque = BTPageGetOpaque(page);

2858 maxoff = PageGetMaxOffsetNumber(page);

2859 leftsibparent = opaque->btpo_prev;

2860

2861 /*

2862 * _bt_getstackbuf() completes page splits on returned parent buffer when

2863 * required.

2864 *

2865 * In general it's a bad idea for VACUUM to use up more disk space, which

2866 * is why page deletion does not finish incomplete page splits most of the

2867 * time. We allow this limited exception because the risk is much lower,

2868 * and the potential downside of not proceeding is much higher: A single

2869 * internal page with the INCOMPLETE_SPLIT flag set might otherwise

2870 * prevent us from deleting hundreds of empty leaf pages from one level

2871 * down.

2872 */

2873 Assert(!P_INCOMPLETE_SPLIT(opaque));

2874

2875 if (parentoffset < maxoff)

2876 {

2877 /*

2878 * Child is not the rightmost child in parent, so it's safe to delete

2879 * the subtree whose root/topparent is child page

2880 */

2881 *subtreeparent = pbuf;

2882 *poffset = parentoffset;

2883 return true;

2884 }

2885

2886 /*

2887 * Child is the rightmost child of parent.

2888 *

2889 * Since it's the rightmost child of parent, deleting the child (or

2890 * deleting the subtree whose root/topparent is the child page) is only

2891 * safe when it's also possible to delete the parent.

2892 */

2893 Assert(parentoffset == maxoff);

2894 if (parentoffset != P_FIRSTDATAKEY(opaque) || P_RIGHTMOST(opaque))

2895 {

2896 /*

2897 * Child isn't parent's only child, or parent is rightmost on its

2898 * entire level. Definitely cannot delete any pages.

2899 */

2900 _bt_relbuf(rel, pbuf);

2901 return false;

2902 }

2903

2904 /*

2905 * Now make sure that the parent deletion is itself safe by examining the

2906 * child's grandparent page. Recurse, passing the parent page as the

2907 * child page (child's grandparent is the parent on the next level up). If

2908 * parent deletion is unsafe, then child deletion must also be unsafe (in

2909 * which case caller cannot delete any pages at all).

2910 */

2911 *topparent = parent;

2912 *topparentrightsib = opaque->btpo_next;

2913

2914 /*

2915 * Release lock on parent before recursing.

2916 *

2917 * It's OK to release page locks on parent before recursive call locks

2918 * grandparent. An internal page can only acquire an entry if the child

2919 * is split, but that cannot happen as long as we still hold a lock on the

2920 * leafbuf page.

2921 */

2922 _bt_relbuf(rel, pbuf);

2923

2924 /*

2925 * Before recursing, check that the left sibling of parent (if any) is not

2926 * marked with INCOMPLETE_SPLIT flag first (must do so after we drop the

2927 * parent lock).

2928 *

2929 * Note: We deliberately avoid completing incomplete splits here.

2930 */

2931 if (_bt_leftsib_splitflag(rel, leftsibparent, parent))

2932 return false;

2933

2934 /* Recurse to examine child page's grandparent page */

2935 return _bt_lock_subtree_parent(rel, heaprel, parent, stack->bts_parent,

2936 subtreeparent, poffset,

2937 topparent, topparentrightsib);

2938}

2939

2940/*

2941 * Initialize local memory state used by VACUUM for _bt_pendingfsm_finalize

2942 * optimization.

2943 *

2944 * Called at the start of a btvacuumscan(). Caller's cleanuponly argument

2945 * indicates if ongoing VACUUM has not (and will not) call btbulkdelete().

2946 *

2947 * We expect to allocate memory inside VACUUM's top-level memory context here.

2948 * The working buffer is subject to a limit based on work_mem. Our strategy

2949 * when the array can no longer grow within the bounds of that limit is to

2950 * stop saving additional newly deleted pages, while proceeding as usual with

2951 * the pages that we can fit.

2952 */

2953void

2954 _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)

2955{

2956 Size maxbufsize;

2957

2958 /*

2959 * Don't bother with optimization in cleanup-only case -- we don't expect

2960 * any newly deleted pages. Besides, cleanup-only calls to btvacuumscan()

2961 * can only take place because this optimization didn't work out during

2962 * the last VACUUM.

2963 */

2964 if (cleanuponly)

2965 return;

2966

2967 /*

2968 * Cap maximum size of array so that we always respect work_mem. Avoid

2969 * int overflow here.

2970 */

2971 vstate->bufsize = 256;

2972 maxbufsize = (work_mem * (Size) 1024) / sizeof(BTPendingFSM);

2973 maxbufsize = Min(maxbufsize, MaxAllocSize / sizeof(BTPendingFSM));

2974 /* BTVacState.maxbufsize has type int */

2975 maxbufsize = Min(maxbufsize, INT_MAX);

2976 /* Stay sane with small work_mem */

2977 maxbufsize = Max(maxbufsize, vstate->bufsize);

2978 vstate->maxbufsize = (int) maxbufsize;

2979

2980 /* Allocate buffer, indicate that there are currently 0 pending pages */

2981 vstate->pendingpages = palloc(sizeof(BTPendingFSM) * vstate->bufsize);

2982 vstate->npendingpages = 0;

2983}

2984

2985/*

2986 * Place any newly deleted pages (i.e. pages that _bt_pagedel() deleted during

2987 * the ongoing VACUUM operation) into the free space map -- though only when

2988 * it is actually safe to do so by now.

2989 *

2990 * Called at the end of a btvacuumscan(), just before free space map vacuuming

2991 * takes place.

2992 *

2993 * Frees memory allocated by _bt_pendingfsm_init(), if any.

2994 */

2995void

2996 _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)

2997{

2998 IndexBulkDeleteResult *stats = vstate->stats;

2999 Relation heaprel = vstate->info->heaprel;

3000

3001 Assert(stats->pages_newly_deleted >= vstate->npendingpages);

3002 Assert(heaprel != NULL);

3003

3004 if (vstate->npendingpages == 0)

3005 {

3006 /* Just free memory when nothing to do */

3007 if (vstate->pendingpages)

3008 pfree(vstate->pendingpages);

3009

3010 return;

3011 }

3012

3013#ifdef DEBUG_BTREE_PENDING_FSM

3014

3015 /*

3016 * Debugging aid: Sleep for 5 seconds to greatly increase the chances of

3017 * placing pending pages in the FSM. Note that the optimization will

3018 * never be effective without some other backend concurrently consuming an

3019 * XID.

3020 */

3021 pg_usleep(5000000L);

3022#endif

3023

3024 /*

3025 * Recompute VACUUM XID boundaries.

3026 *

3027 * We don't actually care about the oldest non-removable XID. Computing

3028 * the oldest such XID has a useful side-effect that we rely on: it

3029 * forcibly updates the XID horizon state for this backend. This step is

3030 * essential; GlobalVisCheckRemovableFullXid() will not reliably recognize

3031 * that it is now safe to recycle newly deleted pages without this step.

3032 */

3033 GetOldestNonRemovableTransactionId(heaprel);

3034

3035 for (int i = 0; i < vstate->npendingpages; i++)

3036 {

3037 BlockNumber target = vstate->pendingpages[i].target;

3038 FullTransactionId safexid = vstate->pendingpages[i].safexid;

3039

3040 /*

3041 * Do the equivalent of checking BTPageIsRecyclable(), but without

3042 * accessing the page again a second time.

3043 *

3044 * Give up on finding the first non-recyclable page -- all later pages

3045 * must be non-recyclable too, since _bt_pendingfsm_add() adds pages

3046 * to the array in safexid order.

3047 */

3048 if (!GlobalVisCheckRemovableFullXid(heaprel, safexid))

3049 break;

3050

3051 RecordFreeIndexPage(rel, target);

3052 stats->pages_free++;

3053 }

3054

3055 pfree(vstate->pendingpages);

3056}

3057

3058/*

3059 * Maintain array of pages that were deleted during current btvacuumscan()

3060 * call, for use in _bt_pendingfsm_finalize()

3061 */

3062static void

3063 _bt_pendingfsm_add(BTVacState *vstate,

3064 BlockNumber target,

3065 FullTransactionId safexid)

3066{

3067 Assert(vstate->npendingpages <= vstate->bufsize);

3068 Assert(vstate->bufsize <= vstate->maxbufsize);

3069

3070#ifdef USE_ASSERT_CHECKING

3071

3072 /*

3073 * Verify an assumption made by _bt_pendingfsm_finalize(): pages from the

3074 * array will always be in safexid order (since that is the order that we

3075 * save them in here)

3076 */

3077 if (vstate->npendingpages > 0)

3078 {

3079 FullTransactionId lastsafexid =

3080 vstate->pendingpages[vstate->npendingpages - 1].safexid;

3081

3082 Assert(FullTransactionIdFollowsOrEquals(safexid, lastsafexid));

3083 }

3084#endif

3085

3086 /*

3087 * If temp buffer reaches maxbufsize/work_mem capacity then we discard

3088 * information about this page.

3089 *

3090 * Note that this also covers the case where we opted to not use the

3091 * optimization in _bt_pendingfsm_init().

3092 */

3093 if (vstate->npendingpages == vstate->maxbufsize)

3094 return;

3095

3096 /* Consider enlarging buffer */

3097 if (vstate->npendingpages == vstate->bufsize)

3098 {

3099 int newbufsize = vstate->bufsize * 2;

3100

3101 /* Respect work_mem */

3102 if (newbufsize > vstate->maxbufsize)

3103 newbufsize = vstate->maxbufsize;

3104

3105 vstate->bufsize = newbufsize;

3106 vstate->pendingpages =

3107 repalloc(vstate->pendingpages,

3108 sizeof(BTPendingFSM) * vstate->bufsize);

3109 }

3110

3111 /* Save metadata for newly deleted page */

3112 vstate->pendingpages[vstate->npendingpages].target = target;

3113 vstate->pendingpages[vstate->npendingpages].safexid = safexid;

3114 vstate->npendingpages++;

3115}

BlockNumber

uint32 BlockNumber

Definition: block.h:31

InvalidBlockNumber

#define InvalidBlockNumber

Definition: block.h:33

BlockNumberIsValid

static bool BlockNumberIsValid(BlockNumber blockNumber)

Definition: block.h:71

Buffer

int Buffer

Definition: buf.h:23

InvalidBuffer

#define InvalidBuffer

Definition: buf.h:25

BufferGetBlockNumber

BlockNumber BufferGetBlockNumber(Buffer buffer)

Definition: bufmgr.c:4198

ReleaseAndReadBuffer

Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum)

Definition: bufmgr.c:2983

ExtendBufferedRel

Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)

Definition: bufmgr.c:858

ConditionalLockBuffer

bool ConditionalLockBuffer(Buffer buffer)

Definition: bufmgr.c:5598

ReleaseBuffer

void ReleaseBuffer(Buffer buffer)

Definition: bufmgr.c:5338

MarkBufferDirty

void MarkBufferDirty(Buffer buffer)

Definition: bufmgr.c:2921

LockBufferForCleanup

void LockBufferForCleanup(Buffer buffer)

Definition: bufmgr.c:5652

LockBuffer

void LockBuffer(Buffer buffer, int mode)

Definition: bufmgr.c:5572

ReadBuffer

Buffer ReadBuffer(Relation reln, BlockNumber blockNum)

Definition: bufmgr.c:758

BUFFER_LOCK_UNLOCK

#define BUFFER_LOCK_UNLOCK

Definition: bufmgr.h:196

RelationGetNumberOfBlocks

#define RelationGetNumberOfBlocks(reln)

Definition: bufmgr.h:283

BufferGetPage

static Page BufferGetPage(Buffer buffer)

Definition: bufmgr.h:417

BufferGetPageSize

static Size BufferGetPageSize(Buffer buffer)

Definition: bufmgr.h:406

EB_LOCK_FIRST

@ EB_LOCK_FIRST

Definition: bufmgr.h:87

BMR_REL

#define BMR_REL(p_rel)

Definition: bufmgr.h:111

BufferIsValid

static bool BufferIsValid(Buffer bufnum)

Definition: bufmgr.h:368

PageIndexMultiDelete

void PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)

Definition: bufpage.c:1160

PageIndexTupleOverwrite

bool PageIndexTupleOverwrite(Page page, OffsetNumber offnum, Item newtup, Size newsize)

Definition: bufpage.c:1404

PageIndexTupleDelete

void PageIndexTupleDelete(Page page, OffsetNumber offnum)

Definition: bufpage.c:1051

PageInit

void PageInit(Page page, Size pageSize, Size specialSize)

Definition: bufpage.c:42

PageHeader

PageHeaderData * PageHeader

Definition: bufpage.h:174

PageGetSpecialSize

static uint16 PageGetSpecialSize(const PageData *page)

Definition: bufpage.h:317

PageGetItem

static Item PageGetItem(const PageData *page, const ItemIdData *itemId)

Definition: bufpage.h:354

PageIsNew

static bool PageIsNew(const PageData *page)

Definition: bufpage.h:234

PageGetItemId

static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)

Definition: bufpage.h:244

PageSetLSN

static void PageSetLSN(Page page, XLogRecPtr lsn)

Definition: bufpage.h:391

Page

PageData * Page

Definition: bufpage.h:82

PageGetMaxOffsetNumber

static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)

Definition: bufpage.h:372

Min

#define Min(x, y)

Definition: c.h:1003

MAXALIGN

#define MAXALIGN(LEN)

Definition: c.h:810

uint8

uint8_t uint8

Definition: c.h:536

PG_USED_FOR_ASSERTS_ONLY

#define PG_USED_FOR_ASSERTS_ONLY

Definition: c.h:223

Max

#define Max(x, y)

Definition: c.h:997

uint16

uint16_t uint16

Definition: c.h:537

uint32

uint32_t uint32

Definition: c.h:538

MemSet

#define MemSet(start, val, len)

Definition: c.h:1019

TransactionId

uint32 TransactionId

Definition: c.h:657

Size

size_t Size

Definition: c.h:610

errmsg_internal

int errmsg_internal(const char *fmt,...)

Definition: elog.c:1161

errhint

int errhint(const char *fmt,...)

Definition: elog.c:1321

errcode

int errcode(int sqlerrcode)

Definition: elog.c:854

errmsg

int errmsg(const char *fmt,...)

Definition: elog.c:1071

LOG

#define LOG

Definition: elog.h:31

DEBUG2

#define DEBUG2

Definition: elog.h:29

PANIC

#define PANIC

Definition: elog.h:42

DEBUG1

#define DEBUG1

Definition: elog.h:30

ERROR

#define ERROR

Definition: elog.h:39

elog

#define elog(elevel,...)

Definition: elog.h:226

ereport

#define ereport(elevel,...)

Definition: elog.h:150

MaxAllocSize

#define MaxAllocSize

Definition: fe_memutils.h:22

work_mem

int work_mem

Definition: globals.c:131

Assert

Assert(PointerIsAligned(start, uint64))

GetFreeIndexPage

BlockNumber GetFreeIndexPage(Relation rel)

Definition: indexfsm.c:38

RecordFreeIndexPage

void RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)

Definition: indexfsm.c:52

indexfsm.h

CopyIndexTuple

IndexTuple CopyIndexTuple(IndexTuple source)

Definition: indextuple.c:547

int.h

pg_cmp_s16

static int pg_cmp_s16(int16 a, int16 b)

Definition: int.h:634

b

int b

Definition: isn.c:74

a

int a

Definition: isn.c:73

i

int i

Definition: isn.c:77

Item

Pointer Item

Definition: item.h:17

ItemPointerCompare

int32 ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)

Definition: itemptr.c:51

ItemPointerEquals

bool ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)

Definition: itemptr.c:35

IndexTuple

IndexTupleData * IndexTuple

Definition: itup.h:53

IndexTupleData

struct IndexTupleData IndexTupleData

IndexTupleSize

static Size IndexTupleSize(const IndexTupleData *itup)

Definition: itup.h:71

MaxIndexTuplesPerPage

#define MaxIndexTuplesPerPage

Definition: itup.h:181

MemoryContextAlloc

void * MemoryContextAlloc(MemoryContext context, Size size)

Definition: mcxt.c:1229

repalloc

void * repalloc(void *pointer, Size size)

Definition: mcxt.c:1610

pfree

void pfree(void *pointer)

Definition: mcxt.c:1594

palloc

void * palloc(Size size)

Definition: mcxt.c:1365

memdebug.h

VALGRIND_MAKE_MEM_DEFINED

#define VALGRIND_MAKE_MEM_DEFINED(addr, size)

Definition: memdebug.h:26

VALGRIND_CHECK_MEM_IS_DEFINED

#define VALGRIND_CHECK_MEM_IS_DEFINED(addr, size)

Definition: memdebug.h:23

VALGRIND_MAKE_MEM_NOACCESS

#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)

Definition: memdebug.h:27

memutils.h

miscadmin.h

START_CRIT_SECTION

#define START_CRIT_SECTION()

Definition: miscadmin.h:149

CHECK_FOR_INTERRUPTS

#define CHECK_FOR_INTERRUPTS()

Definition: miscadmin.h:122

END_CRIT_SECTION

#define END_CRIT_SECTION()

Definition: miscadmin.h:151

_bt_update_posting

void _bt_update_posting(BTVacuumPosting vacposting)

Definition: nbtdedup.c:925

_bt_getstackbuf

Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, BlockNumber child)

Definition: nbtinsert.c:2329

_bt_relandgetbuf

Buffer _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)

Definition: nbtpage.c:1003

_bt_lock_subtree_parent

static bool _bt_lock_subtree_parent(Relation rel, Relation heaprel, BlockNumber child, BTStack stack, Buffer *subtreeparent, OffsetNumber *poffset, BlockNumber *topparent, BlockNumber *topparentrightsib)

Definition: nbtpage.c:2813

_bt_upgrademetapage

void _bt_upgrademetapage(Page page)

Definition: nbtpage.c:107

_bt_relbuf

void _bt_relbuf(Relation rel, Buffer buf)

Definition: nbtpage.c:1023

_bt_gettrueroot

Buffer _bt_gettrueroot(Relation rel)

Definition: nbtpage.c:580

_bt_getrootheight

int _bt_getrootheight(Relation rel)

Definition: nbtpage.c:675

_bt_pageinit

void _bt_pageinit(Page page, Size size)

Definition: nbtpage.c:1129

_bt_rightsib_halfdeadflag

static bool _bt_rightsib_halfdeadflag(Relation rel, BlockNumber leafrightsib)

Definition: nbtpage.c:1752

_bt_pagedel

void _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate)

Definition: nbtpage.c:1802

_bt_allocbuf

Buffer _bt_allocbuf(Relation rel, Relation heaprel)

Definition: nbtpage.c:869

_bt_delitems_vacuum

void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)

Definition: nbtpage.c:1154

_bt_leftsib_splitflag

static bool _bt_leftsib_splitflag(Relation rel, BlockNumber leftsib, BlockNumber target)

Definition: nbtpage.c:1695

_bt_checkpage

void _bt_checkpage(Relation rel, Buffer buf)

Definition: nbtpage.c:797

_bt_metaversion

void _bt_metaversion(Relation rel, bool *heapkeyspace, bool *allequalimage)

Definition: nbtpage.c:739

_bt_getmeta

static BTMetaPageData * _bt_getmeta(Relation rel, Buffer metabuf)

Definition: nbtpage.c:142

_bt_delitems_delete

static void _bt_delitems_delete(Relation rel, Buffer buf, TransactionId snapshotConflictHorizon, bool isCatalogRel, OffsetNumber *deletable, int ndeletable, BTVacuumPosting *updatable, int nupdatable)

Definition: nbtpage.c:1284

_bt_set_cleanup_info

void _bt_set_cleanup_info(Relation rel, BlockNumber num_delpages)

Definition: nbtpage.c:232

_bt_mark_page_halfdead

static bool _bt_mark_page_halfdead(Relation rel, Relation heaprel, Buffer leafbuf, BTStack stack)

Definition: nbtpage.c:2088

_bt_conditionallockbuf

bool _bt_conditionallockbuf(Relation rel, Buffer buf)

Definition: nbtpage.c:1093

_bt_getbuf

Buffer _bt_getbuf(Relation rel, BlockNumber blkno, int access)

Definition: nbtpage.c:845

_bt_unlockbuf

void _bt_unlockbuf(Relation rel, Buffer buf)

Definition: nbtpage.c:1070

_bt_upgradelockbufcleanup

void _bt_upgradelockbufcleanup(Relation rel, Buffer buf)

Definition: nbtpage.c:1109

_bt_initmetapage

void _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level, bool allequalimage)

Definition: nbtpage.c:67

_bt_delitems_delete_check

void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp *delstate)

Definition: nbtpage.c:1513

_bt_vacuum_needs_cleanup

bool _bt_vacuum_needs_cleanup(Relation rel)

Definition: nbtpage.c:179

_bt_delitems_update

static char * _bt_delitems_update(BTVacuumPosting *updatable, int nupdatable, OffsetNumber *updatedoffsets, Size *updatedbuflen, bool needswal)

Definition: nbtpage.c:1405

_bt_delitems_cmp

static int _bt_delitems_cmp(const void *a, const void *b)

Definition: nbtpage.c:1464

_bt_pendingfsm_finalize

void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate)

Definition: nbtpage.c:2996

_bt_lockbuf

void _bt_lockbuf(Relation rel, Buffer buf, int access)

Definition: nbtpage.c:1039

_bt_getroot

Buffer _bt_getroot(Relation rel, Relation heaprel, int access)

Definition: nbtpage.c:344

_bt_pendingfsm_init

void _bt_pendingfsm_init(Relation rel, BTVacState *vstate, bool cleanuponly)

Definition: nbtpage.c:2954

_bt_pendingfsm_add

static void _bt_pendingfsm_add(BTVacState *vstate, BlockNumber target, FullTransactionId safexid)

Definition: nbtpage.c:3063

_bt_unlink_halfdead_page

static bool _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno, bool *rightsib_empty, BTVacState *vstate)

Definition: nbtpage.c:2314

nbtree.h

P_ISHALFDEAD

#define P_ISHALFDEAD(opaque)

Definition: nbtree.h:225

BTreeTupleGetNPosting

static uint16 BTreeTupleGetNPosting(IndexTuple posting)

Definition: nbtree.h:519

BTPageGetMeta

#define BTPageGetMeta(p)

Definition: nbtree.h:122

P_ISLEAF

#define P_ISLEAF(opaque)

Definition: nbtree.h:221

BTPageGetDeleteXid

static FullTransactionId BTPageGetDeleteXid(Page page)

Definition: nbtree.h:261

BTREE_MIN_VERSION

#define BTREE_MIN_VERSION

Definition: nbtree.h:152

BTP_LEAF

#define BTP_LEAF

Definition: nbtree.h:77

BTP_HALF_DEAD

#define BTP_HALF_DEAD

Definition: nbtree.h:81

P_HIKEY

#define P_HIKEY

Definition: nbtree.h:368

BTreeTupleSetTopParent

static void BTreeTupleSetTopParent(IndexTuple leafhikey, BlockNumber blkno)

Definition: nbtree.h:627

P_ISMETA

#define P_ISMETA(opaque)

Definition: nbtree.h:224

P_LEFTMOST

#define P_LEFTMOST(opaque)

Definition: nbtree.h:219

BTPageGetOpaque

#define BTPageGetOpaque(page)

Definition: nbtree.h:74

P_ISDELETED

#define P_ISDELETED(opaque)

Definition: nbtree.h:223

BTREE_MAGIC

#define BTREE_MAGIC

Definition: nbtree.h:150

BTP_META

#define BTP_META

Definition: nbtree.h:80

BTREE_VERSION

#define BTREE_VERSION

Definition: nbtree.h:151

BTreeTupleGetTopParent

static BlockNumber BTreeTupleGetTopParent(IndexTuple leafhikey)

Definition: nbtree.h:621

BTPendingFSM

struct BTPendingFSM BTPendingFSM

BTP_ROOT

#define BTP_ROOT

Definition: nbtree.h:78

BTreeTupleSetDownLink

static void BTreeTupleSetDownLink(IndexTuple pivot, BlockNumber blkno)

Definition: nbtree.h:563

P_FIRSTDATAKEY

#define P_FIRSTDATAKEY(opaque)

Definition: nbtree.h:370

P_ISROOT

#define P_ISROOT(opaque)

Definition: nbtree.h:222

P_NONE

#define P_NONE

Definition: nbtree.h:213

P_RIGHTMOST

#define P_RIGHTMOST(opaque)

Definition: nbtree.h:220

P_INCOMPLETE_SPLIT

#define P_INCOMPLETE_SPLIT(opaque)

Definition: nbtree.h:228

BTREE_METAPAGE

#define BTREE_METAPAGE

Definition: nbtree.h:149

BTreeTupleGetPostingN

static ItemPointer BTreeTupleGetPostingN(IndexTuple posting, int n)

Definition: nbtree.h:545

BT_READ

#define BT_READ

Definition: nbtree.h:730

BTPageIsRecyclable

static bool BTPageIsRecyclable(Page page, Relation heaprel)

Definition: nbtree.h:292

BTreeTupleGetDownLink

static BlockNumber BTreeTupleGetDownLink(IndexTuple pivot)

Definition: nbtree.h:557

P_IGNORE

#define P_IGNORE(opaque)

Definition: nbtree.h:226

BTreeTupleGetMaxHeapTID

static ItemPointer BTreeTupleGetMaxHeapTID(IndexTuple itup)

Definition: nbtree.h:665

BTreeTupleIsPosting

static bool BTreeTupleIsPosting(IndexTuple itup)

Definition: nbtree.h:493

BTPageSetDeleted

static void BTPageSetDeleted(Page page, FullTransactionId safexid)

Definition: nbtree.h:240

BTREE_NOVAC_VERSION

#define BTREE_NOVAC_VERSION

Definition: nbtree.h:153

BTreeTupleGetHeapTID

static ItemPointer BTreeTupleGetHeapTID(IndexTuple itup)

Definition: nbtree.h:639

BT_WRITE

#define BT_WRITE

Definition: nbtree.h:731

_bt_search

BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, int access)

Definition: nbtsearch.c:107

_bt_mkscankey

BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup)

Definition: nbtutils.c:97

nbtxlog.h

SizeOfBtreeVacuum

#define SizeOfBtreeVacuum

Definition: nbtxlog.h:234

XLOG_BTREE_META_CLEANUP

#define XLOG_BTREE_META_CLEANUP

Definition: nbtxlog.h:41

SizeOfBtreeUpdate

#define SizeOfBtreeUpdate

Definition: nbtxlog.h:268

XLOG_BTREE_VACUUM

#define XLOG_BTREE_VACUUM

Definition: nbtxlog.h:39

SizeOfBtreeDelete

#define SizeOfBtreeDelete

Definition: nbtxlog.h:253

SizeOfBtreeMarkPageHalfDead

#define SizeOfBtreeMarkPageHalfDead

Definition: nbtxlog.h:291

XLOG_BTREE_UNLINK_PAGE

#define XLOG_BTREE_UNLINK_PAGE

Definition: nbtxlog.h:35

XLOG_BTREE_UNLINK_PAGE_META

#define XLOG_BTREE_UNLINK_PAGE_META

Definition: nbtxlog.h:36

SizeOfBtreeNewroot

#define SizeOfBtreeNewroot

Definition: nbtxlog.h:347

XLOG_BTREE_MARK_PAGE_HALFDEAD

#define XLOG_BTREE_MARK_PAGE_HALFDEAD

Definition: nbtxlog.h:38

XLOG_BTREE_REUSE_PAGE

#define XLOG_BTREE_REUSE_PAGE

Definition: nbtxlog.h:40

SizeOfBtreeUnlinkPage

#define SizeOfBtreeUnlinkPage

Definition: nbtxlog.h:328

SizeOfBtreeReusePage

#define SizeOfBtreeReusePage

Definition: nbtxlog.h:192

XLOG_BTREE_NEWROOT

#define XLOG_BTREE_NEWROOT

Definition: nbtxlog.h:37

XLOG_BTREE_DELETE

#define XLOG_BTREE_DELETE

Definition: nbtxlog.h:34

InvalidOffsetNumber

#define InvalidOffsetNumber

Definition: off.h:26

OffsetNumberIsValid

#define OffsetNumberIsValid(offsetNumber)

Definition: off.h:39

OffsetNumberNext

#define OffsetNumberNext(offsetNumber)

Definition: off.h:52

OffsetNumber

uint16 OffsetNumber

Definition: off.h:24

buf

static char * buf

Definition: pg_test_fsync.c:72

qsort

#define qsort(a, b, c, d)

Definition: port.h:479

postgres.h

PredicateLockPageCombine

void PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, BlockNumber newblkno)

Definition: predicate.c:3229

predicate.h

access

short access

Definition: preproc-type.c:36

GetOldestNonRemovableTransactionId

TransactionId GetOldestNonRemovableTransactionId(Relation rel)

Definition: procarray.c:1953

GlobalVisCheckRemovableFullXid

bool GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)

Definition: procarray.c:4248

procarray.h

RelationGetRelationName

#define RelationGetRelationName(relation)

Definition: rel.h:548

RelationIsAccessibleInLogicalDecoding

#define RelationIsAccessibleInLogicalDecoding(relation)

Definition: rel.h:693

RelationNeedsWAL

#define RelationNeedsWAL(relation)

Definition: rel.h:637

RelationUsesLocalBuffers

#define RelationUsesLocalBuffers(relation)

Definition: rel.h:646

MAIN_FORKNUM

@ MAIN_FORKNUM

Definition: relpath.h:58

pg_usleep

void pg_usleep(long microsec)

Definition: signal.c:53

snapmgr.h

BTMetaPageData

Definition: nbtree.h:105

BTMetaPageData::btm_last_cleanup_num_delpages

uint32 btm_last_cleanup_num_delpages

Definition: nbtree.h:115

BTMetaPageData::btm_level

uint32 btm_level

Definition: nbtree.h:109

BTMetaPageData::btm_last_cleanup_num_heap_tuples

float8 btm_last_cleanup_num_heap_tuples

Definition: nbtree.h:117

BTMetaPageData::btm_fastroot

BlockNumber btm_fastroot

Definition: nbtree.h:110

BTMetaPageData::btm_version

uint32 btm_version

Definition: nbtree.h:107

BTMetaPageData::btm_magic

uint32 btm_magic

Definition: nbtree.h:106

BTMetaPageData::btm_root

BlockNumber btm_root

Definition: nbtree.h:108

BTMetaPageData::btm_allequalimage

bool btm_allequalimage

Definition: nbtree.h:119

BTMetaPageData::btm_fastlevel

uint32 btm_fastlevel

Definition: nbtree.h:111

BTPageOpaqueData

Definition: nbtree.h:64

BTPageOpaqueData::btpo_next

BlockNumber btpo_next

Definition: nbtree.h:66

BTPageOpaqueData::btpo_prev

BlockNumber btpo_prev

Definition: nbtree.h:65

BTPageOpaqueData::btpo_flags

uint16 btpo_flags

Definition: nbtree.h:68

BTPageOpaqueData::btpo_level

uint32 btpo_level

Definition: nbtree.h:67

BTPageOpaqueData::btpo_cycleid

BTCycleId btpo_cycleid

Definition: nbtree.h:69

BTPendingFSM

Definition: nbtree.h:326

BTPendingFSM::safexid

FullTransactionId safexid

Definition: nbtree.h:328

BTPendingFSM::target

BlockNumber target

Definition: nbtree.h:327

BTScanInsertData

Definition: nbtree.h:796

BTScanInsertData::nextkey

bool nextkey

Definition: nbtree.h:800

BTScanInsertData::backward

bool backward

Definition: nbtree.h:801

BTStackData

Definition: nbtree.h:744

BTStackData::bts_blkno

BlockNumber bts_blkno

Definition: nbtree.h:745

BTStackData::bts_parent

struct BTStackData * bts_parent

Definition: nbtree.h:747

BTStackData::bts_offset

OffsetNumber bts_offset

Definition: nbtree.h:746

BTVacState

Definition: nbtree.h:332

BTVacState::stats

IndexBulkDeleteResult * stats

Definition: nbtree.h:334

BTVacState::pendingpages

BTPendingFSM * pendingpages

Definition: nbtree.h:345

BTVacState::npendingpages

int npendingpages

Definition: nbtree.h:346

BTVacState::info

IndexVacuumInfo * info

Definition: nbtree.h:333

BTVacState::bufsize

int bufsize

Definition: nbtree.h:343

BTVacState::maxbufsize

int maxbufsize

Definition: nbtree.h:344

BTVacuumPostingData

Definition: nbtree.h:915

BTVacuumPostingData::deletetids

uint16 deletetids[FLEXIBLE_ARRAY_MEMBER]

Definition: nbtree.h:922

BTVacuumPostingData::ndeletedtids

uint16 ndeletedtids

Definition: nbtree.h:921

BTVacuumPostingData::itup

IndexTuple itup

Definition: nbtree.h:917

BTVacuumPostingData::updatedoffset

OffsetNumber updatedoffset

Definition: nbtree.h:918

FullTransactionId

Definition: transam.h:66

IndexBulkDeleteResult

Definition: genam.h:103

IndexBulkDeleteResult::pages_deleted

BlockNumber pages_deleted

Definition: genam.h:109

IndexBulkDeleteResult::pages_newly_deleted

BlockNumber pages_newly_deleted

Definition: genam.h:108

IndexBulkDeleteResult::pages_free

BlockNumber pages_free

Definition: genam.h:110

IndexTupleData

Definition: itup.h:36

IndexTupleData::t_tid

ItemPointerData t_tid

Definition: itup.h:37

IndexTupleData::t_info

unsigned short t_info

Definition: itup.h:49

IndexVacuumInfo::heaprel

Relation heaprel

Definition: genam.h:74

ItemIdData

Definition: itemid.h:26

ItemPointerData

Definition: itemptr.h:37

RelationData

Definition: rel.h:56

RelationData::rd_amcache

void * rd_amcache

Definition: rel.h:229

RelationData::rd_indexcxt

MemoryContext rd_indexcxt

Definition: rel.h:204

RelationData::rd_locator

RelFileLocator rd_locator

Definition: rel.h:57

TM_IndexDeleteOp

Definition: tableam.h:245

TM_IndexDeleteOp::status

TM_IndexStatus * status

Definition: tableam.h:254

TM_IndexDeleteOp::ndeltids

int ndeltids

Definition: tableam.h:252

TM_IndexDeleteOp::bottomup

bool bottomup

Definition: tableam.h:248

TM_IndexDeleteOp::deltids

TM_IndexDelete * deltids

Definition: tableam.h:253

TM_IndexDelete

Definition: tableam.h:211

TM_IndexDelete::tid

ItemPointerData tid

Definition: tableam.h:212

TM_IndexDelete::id

int16 id

Definition: tableam.h:213

TM_IndexStatus

Definition: tableam.h:217

TM_IndexStatus::knowndeletable

bool knowndeletable

Definition: tableam.h:219

TM_IndexStatus::idxoffnum

OffsetNumber idxoffnum

Definition: tableam.h:218

xl_btree_delete

Definition: nbtxlog.h:237

xl_btree_delete::snapshotConflictHorizon

TransactionId snapshotConflictHorizon

Definition: nbtxlog.h:238

xl_btree_delete::isCatalogRel

bool isCatalogRel

Definition: nbtxlog.h:241

xl_btree_delete::ndeleted

uint16 ndeleted

Definition: nbtxlog.h:239

xl_btree_delete::nupdated

uint16 nupdated

Definition: nbtxlog.h:240

xl_btree_mark_page_halfdead

Definition: nbtxlog.h:281

xl_btree_mark_page_halfdead::poffset

OffsetNumber poffset

Definition: nbtxlog.h:282

xl_btree_mark_page_halfdead::leafblk

BlockNumber leafblk

Definition: nbtxlog.h:285

xl_btree_mark_page_halfdead::leftblk

BlockNumber leftblk

Definition: nbtxlog.h:286

xl_btree_mark_page_halfdead::topparent

BlockNumber topparent

Definition: nbtxlog.h:288

xl_btree_mark_page_halfdead::rightblk

BlockNumber rightblk

Definition: nbtxlog.h:287

xl_btree_metadata

Definition: nbtxlog.h:47

xl_btree_metadata::level

uint32 level

Definition: nbtxlog.h:50

xl_btree_metadata::version

uint32 version

Definition: nbtxlog.h:48

xl_btree_metadata::allequalimage

bool allequalimage

Definition: nbtxlog.h:54

xl_btree_metadata::fastroot

BlockNumber fastroot

Definition: nbtxlog.h:51

xl_btree_metadata::fastlevel

uint32 fastlevel

Definition: nbtxlog.h:52

xl_btree_metadata::root

BlockNumber root

Definition: nbtxlog.h:49

xl_btree_metadata::last_cleanup_num_delpages

uint32 last_cleanup_num_delpages

Definition: nbtxlog.h:53

xl_btree_newroot

Definition: nbtxlog.h:342

xl_btree_newroot::level

uint32 level

Definition: nbtxlog.h:344

xl_btree_newroot::rootblk

BlockNumber rootblk

Definition: nbtxlog.h:343

xl_btree_reuse_page

Definition: nbtxlog.h:184

xl_btree_reuse_page::snapshotConflictHorizon

FullTransactionId snapshotConflictHorizon

Definition: nbtxlog.h:187

xl_btree_reuse_page::locator

RelFileLocator locator

Definition: nbtxlog.h:185

xl_btree_reuse_page::block

BlockNumber block

Definition: nbtxlog.h:186

xl_btree_reuse_page::isCatalogRel

bool isCatalogRel

Definition: nbtxlog.h:188

xl_btree_unlink_page

Definition: nbtxlog.h:308

xl_btree_unlink_page::leftsib

BlockNumber leftsib

Definition: nbtxlog.h:309

xl_btree_unlink_page::safexid

FullTransactionId safexid

Definition: nbtxlog.h:312

xl_btree_unlink_page::rightsib

BlockNumber rightsib

Definition: nbtxlog.h:310

xl_btree_unlink_page::leaftopparent

BlockNumber leaftopparent

Definition: nbtxlog.h:323

xl_btree_unlink_page::leafleftsib

BlockNumber leafleftsib

Definition: nbtxlog.h:321

xl_btree_unlink_page::level

uint32 level

Definition: nbtxlog.h:311

xl_btree_unlink_page::leafrightsib

BlockNumber leafrightsib

Definition: nbtxlog.h:322

xl_btree_update

Definition: nbtxlog.h:262

xl_btree_update::ndeletedtids

uint16 ndeletedtids

Definition: nbtxlog.h:263

xl_btree_vacuum

Definition: nbtxlog.h:221

xl_btree_vacuum::ndeleted

uint16 ndeleted

Definition: nbtxlog.h:222

xl_btree_vacuum::nupdated

uint16 nupdated

Definition: nbtxlog.h:223

tableam.h

table_index_delete_tuples

static TransactionId table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)

Definition: tableam.h:1321

transam.h

InvalidTransactionId

#define InvalidTransactionId

Definition: transam.h:31

FullTransactionIdFollowsOrEquals

#define FullTransactionIdFollowsOrEquals(a, b)

Definition: transam.h:54

ReadNextFullTransactionId

FullTransactionId ReadNextFullTransactionId(void)

Definition: varsup.c:288

xlog.h

XLogStandbyInfoActive

#define XLogStandbyInfoActive()

Definition: xlog.h:123

XLogRecPtr

uint64 XLogRecPtr

Definition: xlogdefs.h:21

XLogInsert

XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)

Definition: xloginsert.c:474

XLogRegisterBufData

void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)

Definition: xloginsert.c:405

XLogRegisterData

void XLogRegisterData(const void *data, uint32 len)

Definition: xloginsert.c:364

XLogRegisterBuffer

void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)

Definition: xloginsert.c:242

XLogBeginInsert

void XLogBeginInsert(void)

Definition: xloginsert.c:149

xloginsert.h

REGBUF_STANDARD

#define REGBUF_STANDARD

Definition: xloginsert.h:35

REGBUF_WILL_INIT

#define REGBUF_WILL_INIT

Definition: xloginsert.h:34

PostgreSQL Source Code: src/backend/access/nbtree/nbtpage.c Source File