/*-------------------------------------------------------------------------
 * Build a btree from sorted input by loading leaf pages sequentially.
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * We use the bulk smgr loading facility to bypass the buffer cache and
 * WAL-log the pages efficiently.
 *
 * This code isn't concerned about the FSM at all.  The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/nbtree/nbtsort.c
 *
 *-------------------------------------------------------------------------
 */
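/*
 * A rough sketch of the free-space targets those fill factors imply,
 * assuming the default 8kB block size.  The real thresholds come from
 * BTGetTargetPageFreeSpace() and BTREE_NONLEAF_FILLFACTOR; this only shows
 * the arithmetic, with hypothetical variable names.
 */
#ifdef NBTSORT_FILLFACTOR_EXAMPLE		/* illustration only, not compiled */
	Size		leaf_free = BLCKSZ * (100 - 90) / 100;	/* 8192 -> 819 bytes left free */
	Size		upper_free = BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;	/* 70 -> 2457 bytes */
#endif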
/* Magic numbers for parallel state sharing */
#define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2	UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xA000000000000004)
#define PARALLEL_KEY_WAL_USAGE			UINT64CONST(0xA000000000000005)
#define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xA000000000000006)
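/*
 * Rough sketch of how these keys are used (variable names are assumptions;
 * the surrounding setup is elided here): the leader publishes each piece of
 * shared state under its key in the parallel context's table of contents,
 * and workers later retrieve it with the same key.
 */
#ifdef NBTSORT_TOC_KEY_EXAMPLE			/* illustration only, not compiled */
	/* leader side, after shm_toc_allocate() filled in btshared */
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* worker side, in _bt_parallel_build_main() */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
#endif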
 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
 * parallel index builds.  This may be useful as a debugging aid.
#undef DISABLE_LEADER_PARTICIPATION
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
 * entry, private to tuplesort.c but allocated by this module on its behalf.
 *
 * These fields are not modified during the sort.  They primarily exist
 * for the benefit of worker processes that need to create BTSpool state
 * corresponding to that used by the leader.
/* Query ID, for report in worker processes */

 * workersdonecv is used to monitor the progress of workers.  All parallel
 * participants must indicate that they are done before leader can use
 * mutable state that workers maintain during scan (and before leader can
 * proceed to tuplesort_performsort()).
 *
 * mutex protects all fields before heapdesc.
 *
 * These fields contain status information of interest to B-Tree index
 * builds that must work just the same when an index is built in parallel.
 *
 * Mutable state that is maintained by workers, and reported back to
 * leader at end of parallel scan.
 *
 * nparticipantsdone is number of worker processes finished.
 *
 * reltuples is the total number of input heap tuples.
 *
 * havedead indicates if RECENTLY_DEAD tuples were encountered during build.
 *
 * indtuples is the total number of tuples that made it into the index.
 *
 * brokenhotchain indicates if any worker detected a broken HOT chain
 * during build.
 *
 * ParallelTableScanDescData data follows.  Can't directly embed here, as
 * implementations of the parallel table scan desc interface might need
 * stronger alignment.
 * Return pointer to a BTShared's parallel table scan.
 *
 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
#define ParallelTableScanFromBTShared(shared) \
	(ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 * Status for leader in parallel index build.

/* parallel context itself */

 * nparticipanttuplesorts is the exact number of worker processes
 * successfully launched, plus one leader process if it participates as a
 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
 * participating as a worker).
 *
 * Leader process convenience pointers to shared state (leader avoids TOC
 * lookups).
 *
 * btshared is the shared state for entire build.  sharedsort is the
 * shared, tuplesort-managed state passed to each process tuplesort.
 * sharedsort2 is the corresponding btspool2 shared state, used only when
 * building unique indexes.  snapshot is the snapshot used by the scan iff
 * an MVCC snapshot is required.
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 *
 * spool2 is needed only when the index is a unique index.  Dead tuples are
 * put into spool2 instead of spool in order to avoid the uniqueness check.
 *
 * btleader is only present when a parallel index build is performed, and
 * only in the leader process.  (Actually, only the leader has a
 * BTBuildState.  Workers have their own spool and spool2, though.)
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * Overall status record for index writing phase.

			   bool *isnull, bool tupleIsAlive, void *state);
			   bool newfirstdataitem);
			   bool *brokenhotchain);
 *	btbuild() -- build a new btree index.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */

buildstate.heap = heap;
buildstate.spool = NULL;

 * We expect to be called exactly once for any index relation.  If that's
 * not the case, big trouble's what we have.
elog(ERROR, "index \"%s\" already contains data",
	 RelationGetRelationName(index));

 * Finish the build by (1) completing the sort of the spool file, (2)
 * inserting the sorted tuples into btree pages and (3) building the upper
 * levels.  Finally, it may also be necessary to end use of parallelism.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */
 * Create and initialize one or two spool structures, and save them in caller's
 * buildstate argument.  May also fill-in fields within indexInfo used by index
 * builds.
 *
 * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
 * routine encapsulates all aspects of managing parallelism.  Caller need only
 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
 *
 * Returns the total number of heap tuples scanned.
double		reltuples = 0;

 * We size the sort area as maintenance_work_mem rather than work_mem to
 * speed index creation.  This should be OK since a single backend can't
 * run multiple index creations in parallel (see also: notes on
 * parallelism and maintenance_work_mem below).
btspool->heap = heap;

/* Save as primary spool */
buildstate->spool = btspool;
/* Report table scan phase started */

/* Attempt to launch parallel worker scan when required */

 * If parallel build requested and at least one worker process was
 * successfully launched, set up coordination state

 * Begin serial/leader tuplesort.
 *
 * In cases where parallelism is involved, the leader receives the same
 * share of maintenance_work_mem as a serial sort (it is generally treated
 * in the same way as a serial sort once we return).  Parallel worker
 * Tuplesortstates will have received only a fraction of
 * maintenance_work_mem, though.
 *
 * We rely on the lifetime of the Leader Tuplesortstate almost not
 * overlapping with any worker Tuplesortstate's lifetime.  There may be
 * some small overlap, but that's okay because we rely on leader
 * Tuplesortstate only allocating a small, fixed amount of memory here.
 * When its tuplesort_performsort() is called (by our caller), and
 * significant amounts of memory are likely to be used, all workers must
 * have already freed almost all memory held by their Tuplesortstates
 * (they are about to go away completely, too).  The overall effect is
 * that maintenance_work_mem always represents an absolute high watermark
 * on the amount of memory used by a CREATE INDEX operation, regardless of
 * the use of parallelism or any other factor.
 * If building a unique index, put dead tuples in a second spool to keep
 * them out of the uniqueness check.  We expect that the second spool (for
 * dead tuples) won't get very full, so we give it only work_mem.

/* Initialize secondary spool */
btspool2->heap = heap;

/* Save as secondary spool */
buildstate->spool2 = btspool2;

 * Set up non-private state that is passed to
 * tuplesort_begin_index_btree() about the basic high level
 * coordination of a parallel sort.
 *
 * We expect that the second one (for dead tuples) won't get very
 * full, so we give it only work_mem
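/*
 * Sketch of how the two sorts are typically begun (simplified; variable and
 * field names mirror the fragments above but take them as assumptions, and
 * coordinate/coordinate2 are NULL for a plain serial build): the primary
 * spool gets maintenance_work_mem, the dead-tuple spool only work_mem.
 */
#ifdef NBTSORT_SPOOL_EXAMPLE			/* illustration only, not compiled */
	buildstate->spool->sortstate =
		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
									buildstate->nulls_not_distinct,
									maintenance_work_mem, coordinate,
									TUPLESORT_NONE);
	if (buildstate->spool2)
		buildstate->spool2->sortstate =
			tuplesort_begin_index_btree(heap, index, false, false,
										work_mem, coordinate2,
										TUPLESORT_NONE);
#endif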
/* Fill spool using either serial or parallel heap scan */

 * Set the progress target for the next phase.  Reset the block number
 * values set by table_index_build_scan
const int	progress_index[] = {
const int64 progress_vals[] = {

/* okay, all heap tuples are spooled */

/* spool2 turns out to be unnecessary */
buildstate->spool2 = NULL;
 * clean up a spool structure and its substructures.

 * spool an index entry into the sort file.

 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */

/* Execute the sort */
/* _bt_mkscankey() won't set allequalimage without metapage */
/* reserve the metapage */
_bt_load(&wstate, btspool, btspool2);

 * Per-tuple callback for table_index_build_scan
 *
 * insert the index tuple into the appropriate spool file for subsequent
 * processing.
if (tupleIsAlive || buildstate->spool2 == NULL)
/* dead tuples are put into spool2 */
 * allocate workspace for a new, clean btree page, not linked to any siblings.
/* Zero the page and set up standard page header info */
/* Initialize BT opaque state */
/* Make the P_HIKEY line pointer appear allocated */

 * emit a completed btree page, and release the working storage.
/* smgr_bulk_write took ownership of 'buf' */
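/*
 * Minimal sketch of the bulk write cycle used for every page emitted by
 * this file (index and blkno stand in for the relation and target block):
 * start a bulk write state for the index, fill buffers obtained from it,
 * hand each one back with its block number, then finish to flush and
 * WAL-log whatever remains.
 */
#ifdef NBTSORT_BULK_WRITE_EXAMPLE		/* illustration only, not compiled */
	BulkWriteState *bulkstate = smgr_bulk_start_rel(index, MAIN_FORKNUM);
	BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

	_bt_pageinit((Page) buf, BLCKSZ);	/* set up a standard btree page */
	smgr_bulk_write(bulkstate, blkno, buf, true);	/* takes ownership of buf */
	smgr_bulk_finish(bulkstate);
#endif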
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
/* create initial page for level */
/* and assign it a page position */
state->btps_lowkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastextra = 0;
state->btps_level = level;
/* set "full" threshold based on level.  See notes at head of file. */
/* no parent level, yet */
state->btps_next = NULL;
 * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).
 *
 * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
 * rightmost page on its level is not supposed to get a high key.  Now that
 * it's clear that this page is a rightmost page, remove the unneeded empty
 * P_HIKEY line pointer space.
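/*
 * A sketch of what that slide amounts to (illustrative, not the exact
 * function body): copy each ItemId one slot to the left, then shrink
 * pd_lower by one ItemId so the trailing slot disappears.
 */
#ifdef NBTSORT_SLIDELEFT_EXAMPLE		/* illustration only, not compiled */
	OffsetNumber off;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(rightmostpage);
	ItemId		previi = PageGetItemId(rightmostpage, P_HIKEY);

	for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
	{
		ItemId		thisii = PageGetItemId(rightmostpage, off);

		*previi = *thisii;
		previi = thisii;
	}
	((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
#endif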
 * Add an item to a page being built.
 *
 * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
 * raises an error directly.
 *
 * Note that our nbtsort.c caller does not know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
 * caller.  Page that turns out to be the rightmost on its level is fixed by
 * calling _bt_slideleft().
			   bool newfirstdataitem)
if (newfirstdataitem)
elog(ERROR, "failed to add item to the index page");
 * Add an item to a disk page from the sort output (or add a posting list
 * item formed from the sort output).
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *	 stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.  If
 * the high key is to be truncated, offset 1 is deleted, and we insert
 * the truncated high key at offset 1.
 *
 * 'last' pointer indicates the last offset added to the page.
 *
 * 'truncextra' is the size of the posting list in itup, if any.  This
 * information is stashed for the next call here, when we may benefit
 * from considering the impact of truncating away the posting list on
 * the page before deciding to finish the page off.  Posting lists are
 * often relatively large, so it is worth going to the trouble of
 * accounting for the saving from truncating away the posting list of
 * the tuple that becomes the high key (that may be the only way to
 * get close to target free space on the page).  Note that this is
 * only used for the soft fillfactor-wise limit, not the critical hard
 * limit.
Size		last_truncextra;

 * This is a handy place to check for cancel interrupts during the btree
 * load phase of index creation.
nbuf = state->btps_buf;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
state->btps_lastextra = truncextra;

/* Leaf case has slightly different rules due to suffix truncation */
isleaf = (state->btps_level == 0);
 * Check whether the new item can fit on a btree page on current level at
 * all.
 *
 * Every newly built index will treat heap TID as part of the keyspace,
 * which imposes the requirement that new high keys must occasionally have
 * a heap TID appended within _bt_truncate().  That may leave a new pivot
 * tuple one or two MAXALIGN() quantums larger than the original
 * firstright tuple it's derived from.  v4 deals with the problem by
 * decreasing the limit on the size of tuples inserted on the leaf level
 * by the same small amount.  Enforce the new v4+ limit on the leaf level,
 * and the old limit on internal levels, since pivot tuples may need to
 * make use of the reserved space.  This should never fail on internal
 * pages.
 *
 * Check to see if current page will fit new item, with space left over to
 * append a heap TID during suffix truncation when page is a leaf page.
 *
 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
 * high key with heap TID when finishing off a leaf page, since we rely on
 * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
 * internal pages we can always fit 3 pivot tuples with larger internal
 * page tuple limit (includes page high key).
 *
 * Most of the time, a page is only "full" in the sense that the soft
 * fillfactor-wise limit has been exceeded.  However, we must always leave
 * at least two items plus a high key on each page before starting a new
 * page.  Disregard fillfactor and insert on "full" current page if we
 * don't have the minimum number of items yet.  (Note that we deliberately
 * assume that suffix truncation neither enlarges nor shrinks new high key
 * when applying soft limit, except when last tuple has a posting list.)
Assert(last_truncextra == 0 || isleaf);
	(pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
 * Finish off the page and write it out.

/* Create new page of same level */
/* and assign it a page position */

 * We copy the last item on the page into the new page, and then
 * rearrange the old page so that the 'last item' becomes its high key
 * rather than a true data item.  There had better be at least two
 * items on the page already, else the page would be empty of useful
 * data.
 *
 * Move 'last' into the high key position on opage.  _bt_blnewpage()
 * allocated empty space for a line pointer when opage was first
 * created, so this is a matter of rearranging already-allocated space
 * on page, and initializing high key line pointer.  (Actually, leaf
 * pages must also swap oitup with a truncated version of oitup, which
 * is sometimes larger than oitup, though never by more than the space
 * needed to append a heap TID.)
 *
 * Truncate away any unneeded attributes from high key on leaf
 * level.  This is only done at the leaf level because downlinks
 * in internal pages are either negative infinity items, or get
 * their contents from copying from one level down.  See also:
 * _bt_split().
 *
 * We don't try to bias our choice of split point to make it more
 * likely that _bt_truncate() can truncate away more attributes,
 * whereas the split point used within _bt_split() is chosen much
 * more delicately.  Even still, the lastleft and firstright
 * tuples passed to _bt_truncate() here are at least not fully
 * equal to each other when deduplication is used, unless there is
 * a large group of duplicates (also, unique index builds usually
 * have few or no spool2 duplicates).  When the split point is
 * between two unequal tuples, _bt_truncate() will avoid including
 * a heap TID in the new high key, which is the most important
 * benefit of suffix truncation.
 *
 * Overwrite the old item with new truncated high key directly.
 * oitup is already located at the physical beginning of tuple
 * space, so this should directly reuse the existing tuple space.
elog(ERROR, "failed to add high key to the index page");
/* oitup should continue to point to the page's high key */

 * Link the old page into its parent, using its low key.  If we don't
 * have a parent, we have to create one; this adds a new btree level.
if (state->btps_next == NULL)

 * Save a copy of the high key from the old page.  It is also the low
 * key for the new page.

 * Set the sibling links for both pages.

 * Write out the old page.  _bt_blwritepage takes ownership of the
 * buffer.

 * Reset last_off to point to new page

 * By here, either original page is still the current page, or a new page
 * was created that became the current page.  Either way, the current page
 * definitely has space for new item.
 *
 * If the new item is the first for its page, it must also be the first
 * item on its entire level.  On later same-level pages, a low key for a
 * page will be copied from the prior page in the code above.  Generate a
 * minus infinity low key here instead.

 * Add the new item into the current page.
state->btps_buf = nbuf;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
 * Finalize pending posting list tuple, and add it to the index.  Final tuple
 * is based on saved base tuple, and saved list of heap TIDs.
 *
 * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
 * using _bt_buildadd().
/* form a tuple with a posting list */
/* Calculate posting list overhead */
pfree(postingtuple);

 * Finish writing out the completed btree.

 * Each iteration of this loop completes one more level of the tree.

 * We have to link the last page on this level to somewhere.
 *
 * If we're at the top, it's the root, so attach it to the metapage.
 * Otherwise, add an entry for it to its parent using its low key.
 * This may cause the last page of the parent level to split, but
 * that's not a problem -- we haven't gotten to it yet.
 *
 * This is the rightmost page, so the ItemId array needs to be slid
 * back one slot.  Then we can dump out the page.
s->btps_buf = NULL;		/* writepage took ownership of the buffer */

 * As the last step in the process, construct the metapage and make it
 * point to the new root (unless we had no data at all, in which case it's
 * set to point to "P_NONE").  This changes the index to the "valid" state
 * by filling in a valid magic number in the metapage.
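/*
 * A sketch of how that final step can look (metabuf, rootblkno and
 * rootlevel are assumptions here): grab one more bulk-write buffer,
 * initialize it as the metapage pointing at the root, and write it out as
 * the metapage block.
 */
#ifdef NBTSORT_METAPAGE_EXAMPLE			/* illustration only, not compiled */
	metabuf = smgr_bulk_get_buf(wstate->bulkstate);
	_bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
					 wstate->inskey->allequalimage);
	_bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
#endif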
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
bool		merge = (btspool2 != NULL);
int64		tuples_done = 0;

 * Another BTSpool for dead tuples exists.  Now we have to merge
 * btspool and btspool2.

/* the preparation of merge */

/* Prepare SortSupport data for each column */
for (i = 0; i < keysz; i++)
/* Abbreviation is not supported here */
load1 = true;			/* load BTSpool next ? */
else if (itup != NULL)
for (i = 1; i <= keysz; i++)
entry = sortKeys + i - 1;
				attrDatum2, isNull2,

 * If key values are equal, we sort on ItemPointer.  This is
 * required for btree indexes, since heap TID is treated as an
 * implicit last key attribute in order to ensure that all
 * keys in the index are physically unique.
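/*
 * Sketch of the merge comparison described above (simplified; variable
 * names such as itup2, tupdes and sortKeys are assumptions): compare the
 * two candidate tuples column by column with the prepared SortSupport
 * entries, and fall back to heap TID order when all key columns compare
 * equal.
 */
#ifdef NBTSORT_MERGE_COMPARE_EXAMPLE	/* illustration only, not compiled */
	for (i = 1; i <= keysz; i++)
	{
		SortSupport entry = sortKeys + i - 1;
		Datum		attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
		Datum		attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

		compare = ApplySortComparator(attrDatum1, isNull1,
									  attrDatum2, isNull2, entry);
		if (compare != 0)
			break;
	}
	if (compare == 0)
		compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
	load1 = (compare <= 0);		/* take from btspool when it sorts first */
#endif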
/* When we see first tuple, create first index page */
/* Report progress */
else if (deduplicate)
/* merge is unnecessary, deduplicate into posting lists */

/* Metadata about base tuple of current pending posting list */
dstate->base = NULL;
/* Metadata about current pending posting list TIDs */
dstate->htids = NULL;

/* When we see first tuple, create first index page */

 * Limit size of posting list tuples to 1/10 space we want to
 * leave behind on the page, plus space for final item's line
 * pointer.  This is equal to the space that we'd like to
 * leave behind on each leaf page when fillfactor is 90,
 * allowing us to get close to fillfactor% space utilization
 * when there happen to be a great many duplicates.  (This
 * makes higher leaf fillfactor settings ineffective when
 * building indexes that have many duplicates, but packing
 * leaf pages full with few very large tuples doesn't seem
 * like a useful goal.)
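/*
 * Sketch of the size cap that comment describes, assuming the default 8kB
 * block size and 8-byte MAXALIGN: roughly a tenth of the block, rounded
 * down to a MAXALIGN boundary, less one line pointer.
 */
#ifdef NBTSORT_POSTING_LIMIT_EXAMPLE	/* illustration only, not compiled */
	dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
		sizeof(ItemIdData);				/* 816 - 4 = 812 bytes with BLCKSZ = 8192 */
#endif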
/* start new pending posting list with itup copy */

 * Tuple is equal to base tuple of pending posting list.  Heap
 * TID from itup has been saved in state.
 *
 * Tuple is not equal to pending posting list tuple, or
 * _bt_dedup_save_htid() opted to not merge current item into
 * pending posting list.
/* start new pending posting list with itup copy */
/* Report progress */

 * Handle the last item (there must be a last item when the
 * tuplesort returned one or more tuples)

/* merging and deduplication are both unnecessary */
/* When we see first tuple, create first index page */
/* Report progress */

/* Close down final pages and write the metapage */
 * Create parallel context, and launch workers for leader.
 *
 * buildstate argument should be initialized (with the exception of the
 * tuplesort state in spools, which may later be created based on shared
 * state initially set up here).
 *
 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 *
 * request is the target number of parallel worker processes to launch.
 *
 * Sets buildstate's BTLeader, which caller must use to shut down parallel
 * mode by passing it to _bt_end_parallel() at the very end of its index
 * build.  If not even a single worker process can be launched, this is
 * never set, and caller should proceed with a serial index build.
int			scantuplesortstates;
bool		leaderparticipates = true;
#ifdef DISABLE_LEADER_PARTICIPATION
leaderparticipates = false;

 * Enter parallel mode, and create context for parallel build of btree
 * index
scantuplesortstates = leaderparticipates ? request + 1 : request;

 * Prepare for scan of the base relation.  In a normal index build, we use
 * SnapshotAny because we must retrieve all tuples and do our own time
 * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
 * concurrent build, we take a regular MVCC snapshot and index whatever's
 * live according to that.
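/*
 * Sketch of the snapshot choice just described (variable names are
 * assumptions): SnapshotAny for a normal build, a registered MVCC snapshot
 * for a concurrent one.
 */
#ifdef NBTSORT_SNAPSHOT_EXAMPLE			/* illustration only, not compiled */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());
#endif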
 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
 * PARALLEL_KEY_TUPLESORT tuplesort workspace
 *
 * Unique case requires a second spool, and so we may have to account for
 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
 *
 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
 * and PARALLEL_KEY_BUFFER_USAGE.
 *
 * If there are no extensions loaded that care, we could skip this.  We
 * have no way of knowing whether anyone's looking at pgWalUsage or
 * pgBufferUsage, so do it unconditionally.

/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
querylen = 0;			/* keep compiler quiet */

/* Everyone's had a chance to ask for space, so now create the DSM */

/* If no DSM segment was available, back out (do serial build) */
if (pcxt->seg == NULL)
/* Store shared build state, for which we reserved space */
/* Initialize immutable state */
/* Initialize mutable state */

 * Store shared tuplesort-private state, for which we reserved space.
 * Then, initialize opaque state using tuplesort routine.

/* Unique case requires a second spool, and associated shared state */

 * Store additional shared tuplesort-private state, for which we
 * reserved space.  Then, initialize opaque state using tuplesort
 * routine.

/* Store query string for workers */

 * Allocate space for each worker's WalUsage and BufferUsage; no need to
 * initialize.

/* Launch workers, saving status for leader/caller */
btleader->pcxt = pcxt;
if (leaderparticipates)

/* If no workers were successfully launched, back out (do serial build) */

/* Save leader state now that it's clear build will be parallel */

/* Join heap scan ourselves */
if (leaderparticipates)

 * Caller needs to wait for all launched workers when we return.  Make
 * sure that the failure-to-start case will not hang forever.

 * Shut down workers, destroy parallel context, and end parallel mode.
/* Shutdown worker processes */

 * Next, accumulate WAL usage.  (This must wait for the workers to finish,
 * or we might get incomplete data.)

/* Free last reference to MVCC snapshot, if one was used */

 * Returns size of shared memory required to store state for a parallel
 * btree index build based on the snapshot its parallel scan will use.
/* c.f. shm_toc_allocate as to why BUFFERALIGN is used */

 * Within leader, wait for end of heap scan.
 *
 * When called, parallel heap scan started by _bt_begin_parallel() will
 * already be underway within worker processes (when leader participates
 * as a worker, we should end up here just as workers are finishing).
 *
 * Fills in fields needed for ambuild statistics, and lets caller set
 * field indicating that some worker encountered a broken HOT chain.
 *
 * Returns the total number of heap tuples scanned.
int			nparticipanttuplesorts;
					   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
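/*
 * Sketch of the wait loop that call belongs to (simplified; the real loop
 * also copies the mutable counters out while it holds the lock): recheck
 * the done-count under the spinlock and sleep on workersdonecv until every
 * participant has reported in.
 */
#ifdef NBTSORT_HEAPSCAN_WAIT_EXAMPLE	/* illustration only, not compiled */
	for (;;)
	{
		SpinLockAcquire(&btshared->mutex);
		if (btshared->nparticipantsdone == nparticipanttuplesorts)
		{
			SpinLockRelease(&btshared->mutex);
			break;
		}
		SpinLockRelease(&btshared->mutex);

		ConditionVariableSleep(&btshared->workersdonecv,
							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
	}
	ConditionVariableCancelSleep();
#endif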
 * Within leader, participate as a parallel worker.
/* Allocate memory and initialize private spool */

/* Initialize second spool, if required */
leaderworker2 = NULL;

/* Allocate memory for worker's own private secondary spool */
/* Initialize worker's own secondary spool */
leaderworker2->heap = leaderworker->heap;

 * Might as well use reliable figure when doling out maintenance_work_mem
 * (when requested number of workers were not launched, this will be
 * somewhat higher than it is for other workers).

/* Perform work common to all participants */
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */
 * Perform work within a launched parallel process.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */

 * The only possible status flag that can be set to the parallel worker is

/* Set debug_query_string for individual workers first */
/* Report the query string from leader */

/* Look up nbtree shared state */

/* Open relations using lock modes known to be obtained by index.c */

/* Track query ID */

/* Open relations within worker */

/* Initialize worker's own spool */
btspool->heap = heapRel;
btspool->index = indexRel;

/* Look up shared state private to tuplesort.c */

/* Allocate memory for worker's own private secondary spool */
/* Initialize worker's own secondary spool */

/* Look up shared state private to tuplesort.c */
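/*
 * Sketch of those lookups (variable names are assumptions): fetch the
 * Sharedsort structs the leader published under the tuplesort keys and
 * attach this worker's tuplesorts to them.
 */
#ifdef NBTSORT_WORKER_LOOKUP_EXAMPLE	/* illustration only, not compiled */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (btshared->isunique)
	{
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}
#endif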
/* Prepare to track buffer usage during parallel execution */

/* Perform sorting of spool, and possibly a spool2 */
							   sharedsort2, sortmem, false);

/* Report WAL/buffer usage during parallel execution */
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
/* Initialize local tuplesort coordination state */
/* Begin "partial" tuplesort */
							sortmem, coordinate,

 * Just as with serial case, there may be a second spool.  If so, a
 * second, dedicated spool2 partial tuplesort is required.
 *
 * We expect that the second one (for dead tuples) won't get very
 * full, so we give it only work_mem (unless sortmem is less for
 * worker).  Worker processes are generally permitted to allocate
 * work_mem independently.
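/*
 * Sketch of how the dedicated spool2 sort might be begun in a worker
 * (structure-member names are assumptions): a fresh SortCoordinate tied to
 * sharedsort2, with the smaller of sortmem and work_mem as its budget.
 */
#ifdef NBTSORT_WORKER_SPOOL2_EXAMPLE	/* illustration only, not compiled */
	coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
	coordinate2->isWorker = true;
	coordinate2->nParticipants = -1;
	coordinate2->sharedsort = sharedsort2;
	btspool2->sortstate =
		tuplesort_begin_index_btree(btspool->heap, btspool->index,
									false, false,
									Min(sortmem, work_mem), coordinate2,
									TUPLESORT_NONE);
#endif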
/* Fill in buildstate for _bt_build_callback() */
buildstate.spool = btspool;
buildstate.spool2 = btspool2;

/* Join parallel scan */

/* Execute this worker's part of the sort */

 * Done.  Record ambuild statistics, and whether we encountered a broken
 * HOT chain.

/* We can end tuplesorts immediately */