[フレーム]

brin.c

1/*

2 * brin.c

3 * Implementation of BRIN indexes for Postgres

4 *

5 * See src/backend/access/brin/README for details.

6 *

9 *

10 * IDENTIFICATION

11 * src/backend/access/brin/brin.c

12 *

13 * TODO

14 * * ScalarArrayOpExpr (amsearcharray -> SK_SEARCHARRAY)

15 */

16#include "postgres.h"

17

18#include "access/brin.h"

19#include "access/brin_page.h"

20#include "access/brin_pageops.h"

21#include "access/brin_xlog.h"

22#include "access/relation.h"

23#include "access/reloptions.h"

24#include "access/relscan.h"

25#include "access/table.h"

26#include "access/tableam.h"

27#include "access/xloginsert.h"

28#include "catalog/index.h"

29#include "catalog/pg_am.h"

30#include "commands/vacuum.h"

31#include "miscadmin.h"

32#include "pgstat.h"

33#include "postmaster/autovacuum.h"

34#include "storage/bufmgr.h"

35#include "storage/freespace.h"

36#include "tcop/tcopprot.h"

37#include "utils/acl.h"

38#include "utils/datum.h"

39#include "utils/fmgrprotos.h"

40#include "utils/guc.h"

41#include "utils/index_selfuncs.h"

42#include "utils/memutils.h"

43#include "utils/rel.h"

44#include "utils/tuplesort.h"

45

46/* Magic numbers for parallel state sharing */

47 #define PARALLEL_KEY_BRIN_SHARED UINT64CONST(0xB000000000000001)

48 #define PARALLEL_KEY_TUPLESORT UINT64CONST(0xB000000000000002)

49 #define PARALLEL_KEY_QUERY_TEXT UINT64CONST(0xB000000000000003)

50 #define PARALLEL_KEY_WAL_USAGE UINT64CONST(0xB000000000000004)

51 #define PARALLEL_KEY_BUFFER_USAGE UINT64CONST(0xB000000000000005)

52

53/*

54 * Status for index builds performed in parallel. This is allocated in a

55 * dynamic shared memory segment.

56 */

57 typedef struct BrinShared

58{

59 /*

60 * These fields are not modified during the build. They primarily exist

61 * for the benefit of worker processes that need to create state

62 * corresponding to that used by the leader.

63 */

64 Oid heaprelid;

65 Oid indexrelid;

66 bool isconcurrent;

67 BlockNumber pagesPerRange;

68 int scantuplesortstates;

69

70 /* Query ID, for report in worker processes */

71 int64 queryid;

72

73 /*

74 * workersdonecv is used to monitor the progress of workers. All parallel

75 * participants must indicate that they are done before leader can use

76 * results built by the workers (and before leader can write the data into

77 * the index).

78 */

79 ConditionVariable workersdonecv;

80

81 /*

82 * mutex protects all fields before heapdesc.

83 *

84 * These fields contain status information of interest to BRIN index

85 * builds that must work just the same when an index is built in parallel.

86 */

87 slock_t mutex;

88

89 /*

90 * Mutable state that is maintained by workers, and reported back to

91 * leader at end of the scans.

92 *

93 * nparticipantsdone is number of worker processes finished.

94 *

95 * reltuples is the total number of input heap tuples.

96 *

97 * indtuples is the total number of tuples that made it into the index.

98 */

99 int nparticipantsdone;

100 double reltuples;

101 double indtuples;

102

103 /*

104 * ParallelTableScanDescData data follows. Can't directly embed here, as

105 * implementations of the parallel table scan desc interface might need

106 * stronger alignment.

107 */

108 } BrinShared;

109

110/*

111 * Return pointer to a BrinShared's parallel table scan.

112 *

113 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just

114 * MAXALIGN.

115 */

116 #define ParallelTableScanFromBrinShared(shared) \

117 (ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BrinShared)))

118

119/*

120 * Status for leader in parallel index build.

121 */

122 typedef struct BrinLeader

123{

124 /* parallel context itself */

125 ParallelContext *pcxt;

126

127 /*

128 * nparticipanttuplesorts is the exact number of worker processes

129 * successfully launched, plus one leader process if it participates as a

130 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader

131 * participating as a worker).

132 */

133 int nparticipanttuplesorts;

134

135 /*

136 * Leader process convenience pointers to shared state (leader avoids TOC

137 * lookups).

138 *

139 * brinshared is the shared state for entire build. sharedsort is the

140 * shared, tuplesort-managed state passed to each process tuplesort.

141 * snapshot is the snapshot used by the scan iff an MVCC snapshot is

142 * required.

143 */

144 BrinShared *brinshared;

145 Sharedsort *sharedsort;

146 Snapshot snapshot;

147 WalUsage *walusage;

148 BufferUsage *bufferusage;

149 } BrinLeader;

150

151/*

152 * We use a BrinBuildState during initial construction of a BRIN index.

153 * The running state is kept in a BrinMemTuple.

154 */

155 typedef struct BrinBuildState

156{

157 Relation bs_irel;

158 double bs_numtuples;

159 double bs_reltuples;

160 Buffer bs_currentInsertBuf;

161 BlockNumber bs_pagesPerRange;

162 BlockNumber bs_currRangeStart;

163 BlockNumber bs_maxRangeStart;

164 BrinRevmap *bs_rmAccess;

165 BrinDesc *bs_bdesc;

166 BrinMemTuple *bs_dtuple;

167

168 BrinTuple *bs_emptyTuple;

169 Size bs_emptyTupleLen;

170 MemoryContext bs_context;

171

172 /*

173 * bs_leader is only present when a parallel index build is performed, and

174 * only in the leader process. (Actually, only the leader process has a

175 * BrinBuildState.)

176 */

177 BrinLeader *bs_leader;

178 int bs_worker_id;

179

180 /*

181 * The sortstate is used by workers (including the leader). It has to be

182 * part of the build state, because that's the only thing passed to the

183 * build callback etc.

184 */

185 Tuplesortstate *bs_sortstate;

186 } BrinBuildState;

187

188/*

189 * We use a BrinInsertState to capture running state spanning multiple

190 * brininsert invocations, within the same command.

191 */

192 typedef struct BrinInsertState

193{

194 BrinRevmap *bis_rmAccess;

195 BrinDesc *bis_desc;

196 BlockNumber bis_pages_per_range;

197 } BrinInsertState;

198

199/*

200 * Struct used as "opaque" during index scans

201 */

202 typedef struct BrinOpaque

203{

204 BlockNumber bo_pagesPerRange;

205 BrinRevmap *bo_rmAccess;

206 BrinDesc *bo_bdesc;

207 } BrinOpaque;

208

209 #define BRIN_ALL_BLOCKRANGES InvalidBlockNumber

210

211static BrinBuildState *initialize_brin_buildstate(Relation idxRel,

212 BrinRevmap *revmap,

213 BlockNumber pagesPerRange,

214 BlockNumber tablePages);

215static BrinInsertState *initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo);

216static void terminate_brin_buildstate(BrinBuildState *state);

217static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,

218 bool include_partial, double *numSummarized, double *numExisting);

219static void form_and_insert_tuple(BrinBuildState *state);

220static void form_and_spill_tuple(BrinBuildState *state);

221static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a,

222 BrinTuple *b);

223static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy);

224static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc,

225 BrinMemTuple *dtup, const Datum *values, const bool *nulls);

226static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys);

227static void brin_fill_empty_ranges(BrinBuildState *state,

228 BlockNumber prevRange, BlockNumber nextRange);

229

230/* parallel index builds */

231static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,

232 bool isconcurrent, int request);

233static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);

234static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);

235static double _brin_parallel_heapscan(BrinBuildState *state);

236static double _brin_parallel_merge(BrinBuildState *state);

237static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,

238 Relation heap, Relation index);

239static void _brin_parallel_scan_and_build(BrinBuildState *state,

240 BrinShared *brinshared,

241 Sharedsort *sharedsort,

242 Relation heap, Relation index,

243 int sortmem, bool progress);

244

245/*

246 * BRIN handler function: return IndexAmRoutine with access method parameters

247 * and callbacks.

248 */

249Datum

250 brinhandler(PG_FUNCTION_ARGS)

251{

252 IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);

253

254 amroutine->amstrategies = 0;

255 amroutine->amsupport = BRIN_LAST_OPTIONAL_PROCNUM;

256 amroutine->amoptsprocnum = BRIN_PROCNUM_OPTIONS;

257 amroutine->amcanorder = false;

258 amroutine->amcanorderbyop = false;

259 amroutine->amcanhash = false;

260 amroutine->amconsistentequality = false;

261 amroutine->amconsistentordering = false;

262 amroutine->amcanbackward = false;

263 amroutine->amcanunique = false;

264 amroutine->amcanmulticol = true;

265 amroutine->amoptionalkey = true;

266 amroutine->amsearcharray = false;

267 amroutine->amsearchnulls = true;

268 amroutine->amstorage = true;

269 amroutine->amclusterable = false;

270 amroutine->ampredlocks = false;

271 amroutine->amcanparallel = false;

272 amroutine->amcanbuildparallel = true;

273 amroutine->amcaninclude = false;

274 amroutine->amusemaintenanceworkmem = false;

275 amroutine->amsummarizing = true;

276 amroutine->amparallelvacuumoptions =

277 VACUUM_OPTION_PARALLEL_CLEANUP;

278 amroutine->amkeytype = InvalidOid;

279

280 amroutine->ambuild = brinbuild;

281 amroutine->ambuildempty = brinbuildempty;

282 amroutine->aminsert = brininsert;

283 amroutine->aminsertcleanup = brininsertcleanup;

284 amroutine->ambulkdelete = brinbulkdelete;

285 amroutine->amvacuumcleanup = brinvacuumcleanup;

286 amroutine->amcanreturn = NULL;

287 amroutine->amcostestimate = brincostestimate;

288 amroutine->amgettreeheight = NULL;

289 amroutine->amoptions = brinoptions;

290 amroutine->amproperty = NULL;

291 amroutine->ambuildphasename = NULL;

292 amroutine->amvalidate = brinvalidate;

293 amroutine->amadjustmembers = NULL;

294 amroutine->ambeginscan = brinbeginscan;

295 amroutine->amrescan = brinrescan;

296 amroutine->amgettuple = NULL;

297 amroutine->amgetbitmap = bringetbitmap;

298 amroutine->amendscan = brinendscan;

299 amroutine->ammarkpos = NULL;

300 amroutine->amrestrpos = NULL;

301 amroutine->amestimateparallelscan = NULL;

302 amroutine->aminitparallelscan = NULL;

303 amroutine->amparallelrescan = NULL;

304 amroutine->amtranslatestrategy = NULL;

305 amroutine->amtranslatecmptype = NULL;

306

307 PG_RETURN_POINTER(amroutine);

308}

309

310/*

311 * Initialize a BrinInsertState to maintain state to be used across multiple

312 * tuple inserts, within the same command.

313 */

314static BrinInsertState *

315 initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)

316{

317 BrinInsertState *bistate;

318 MemoryContext oldcxt;

319

320 oldcxt = MemoryContextSwitchTo(indexInfo->ii_Context);

321 bistate = palloc0(sizeof(BrinInsertState));

322 bistate->bis_desc = brin_build_desc(idxRel);

323 bistate->bis_rmAccess = brinRevmapInitialize(idxRel,

324 &bistate->bis_pages_per_range);

325 indexInfo->ii_AmCache = bistate;

326 MemoryContextSwitchTo(oldcxt);

327

328 return bistate;

329}

330

331/*

332 * A tuple in the heap is being inserted. To keep a brin index up to date,

333 * we need to obtain the relevant index tuple and compare its stored values

334 * with those of the new tuple. If the tuple values are not consistent with

335 * the summary tuple, we need to update the index tuple.

336 *

337 * If autosummarization is enabled, check if we need to summarize the previous

338 * page range.

339 *

340 * If the range is not currently summarized (i.e. the revmap returns NULL for

341 * it), there's nothing to do for this tuple.

342 */

343bool

344 brininsert(Relation idxRel, Datum *values, bool *nulls,

345 ItemPointer heaptid, Relation heapRel,

346 IndexUniqueCheck checkUnique,

347 bool indexUnchanged,

348 IndexInfo *indexInfo)

349{

350 BlockNumber pagesPerRange;

351 BlockNumber origHeapBlk;

352 BlockNumber heapBlk;

353 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;

354 BrinRevmap *revmap;

355 BrinDesc *bdesc;

356 Buffer buf = InvalidBuffer;

357 MemoryContext tupcxt = NULL;

358 MemoryContext oldcxt = CurrentMemoryContext;

359 bool autosummarize = BrinGetAutoSummarize(idxRel);

360

361 /*

362 * If first time through in this statement, initialize the insert state

363 * that we keep for all the inserts in the command.

364 */

365 if (!bistate)

366 bistate = initialize_brin_insertstate(idxRel, indexInfo);

367

368 revmap = bistate->bis_rmAccess;

369 bdesc = bistate->bis_desc;

370 pagesPerRange = bistate->bis_pages_per_range;

371

372 /*

373 * origHeapBlk is the block number where the insertion occurred. heapBlk

374 * is the first block in the corresponding page range.

375 */

376 origHeapBlk = ItemPointerGetBlockNumber(heaptid);

377 heapBlk = (origHeapBlk / pagesPerRange) * pagesPerRange;

378

379 for (;;)

380 {

381 bool need_insert = false;

382 OffsetNumber off;

383 BrinTuple *brtup;

384 BrinMemTuple *dtup;

385

386 CHECK_FOR_INTERRUPTS();

387

388 /*

389 * If auto-summarization is enabled and we just inserted the first

390 * tuple into the first block of a new non-first page range, request a

391 * summarization run of the previous range.

392 */

393 if (autosummarize &&

394 heapBlk > 0 &&

395 heapBlk == origHeapBlk &&

396 ItemPointerGetOffsetNumber(heaptid) == FirstOffsetNumber)

397 {

398 BlockNumber lastPageRange = heapBlk - 1;

399 BrinTuple *lastPageTuple;

400

401 lastPageTuple =

402 brinGetTupleForHeapBlock(revmap, lastPageRange, &buf, &off,

403 NULL, BUFFER_LOCK_SHARE);

404 if (!lastPageTuple)

405 {

406 bool recorded;

407

408 recorded = AutoVacuumRequestWork(AVW_BRINSummarizeRange,

409 RelationGetRelid(idxRel),

410 lastPageRange);

411 if (!recorded)

412 ereport(LOG,

413 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),

414 errmsg("request for BRIN range summarization for index \"%s\" page %u was not recorded",

415 RelationGetRelationName(idxRel),

416 lastPageRange)));

417 }

418 else

419 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

420 }

421

422 brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off,

423 NULL, BUFFER_LOCK_SHARE);

424

425 /* if range is unsummarized, there's nothing to do */

426 if (!brtup)

427 break;

428

429 /* First time through in this brininsert call? */

430 if (tupcxt == NULL)

431 {

432 tupcxt = AllocSetContextCreate(CurrentMemoryContext,

433 "brininsert cxt",

434 ALLOCSET_DEFAULT_SIZES);

435 MemoryContextSwitchTo(tupcxt);

436 }

437

438 dtup = brin_deform_tuple(bdesc, brtup, NULL);

439

440 need_insert = add_values_to_range(idxRel, bdesc, dtup, values, nulls);

441

442 if (!need_insert)

443 {

444 /*

445 * The tuple is consistent with the new values, so there's nothing

446 * to do.

447 */

448 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

449 }

450 else

451 {

452 Page page = BufferGetPage(buf);

453 ItemId lp = PageGetItemId(page, off);

454 Size origsz;

455 BrinTuple *origtup;

456 Size newsz;

457 BrinTuple *newtup;

458 bool samepage;

459

460 /*

461 * Make a copy of the old tuple, so that we can compare it after

462 * re-acquiring the lock.

463 */

464 origsz = ItemIdGetLength(lp);

465 origtup = brin_copy_tuple(brtup, origsz, NULL, NULL);

466

467 /*

468 * Before releasing the lock, check if we can attempt a same-page

469 * update. Another process could insert a tuple concurrently in

470 * the same page though, so downstream we must be prepared to cope

471 * if this turns out to not be possible after all.

472 */

473 newtup = brin_form_tuple(bdesc, heapBlk, dtup, &newsz);

474 samepage = brin_can_do_samepage_update(buf, origsz, newsz);

475 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

476

477 /*

478 * Try to update the tuple. If this doesn't work for whatever

479 * reason, we need to restart from the top; the revmap might be

480 * pointing at a different tuple for this block now, so we need to

481 * recompute to ensure both our new heap tuple and the other

482 * inserter's are covered by the combined tuple. It might be that

483 * we don't need to update at all.

484 */

485 if (!brin_doupdate(idxRel, pagesPerRange, revmap, heapBlk,

486 buf, off, origtup, origsz, newtup, newsz,

487 samepage))

488 {

489 /* no luck; start over */

490 MemoryContextReset(tupcxt);

491 continue;

492 }

493 }

494

495 /* success! */

496 break;

497 }

498

499 if (BufferIsValid(buf))

500 ReleaseBuffer(buf);

501 MemoryContextSwitchTo(oldcxt);

502 if (tupcxt != NULL)

503 MemoryContextDelete(tupcxt);

504

505 return false;

506}

507

508/*

509 * Callback to clean up the BrinInsertState once all tuple inserts are done.

510 */

511void

512 brininsertcleanup(Relation index, IndexInfo *indexInfo)

513{

514 BrinInsertState *bistate = (BrinInsertState *) indexInfo->ii_AmCache;

515

516 /* bail out if cache not initialized */

517 if (bistate == NULL)

518 return;

519

520 /* do this first to avoid dangling pointer if we fail partway through */

521 indexInfo->ii_AmCache = NULL;

522

523 /*

524 * Clean up the revmap. Note that the brinDesc has already been cleaned up

525 * as part of its own memory context.

526 */

527 brinRevmapTerminate(bistate->bis_rmAccess);

528 pfree(bistate);

529}

530

531/*

532 * Initialize state for a BRIN index scan.

533 *

534 * We read the metapage here to determine the pages-per-range number that this

535 * index was built with. Note that since this cannot be changed while we're

536 * holding lock on index, it's not necessary to recompute it during brinrescan.

537 */

538IndexScanDesc

539 brinbeginscan(Relation r, int nkeys, int norderbys)

540{

541 IndexScanDesc scan;

542 BrinOpaque *opaque;

543

544 scan = RelationGetIndexScan(r, nkeys, norderbys);

545

546 opaque = palloc_object(BrinOpaque);

547 opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);

548 opaque->bo_bdesc = brin_build_desc(r);

549 scan->opaque = opaque;

550

551 return scan;

552}

553

554/*

555 * Execute the index scan.

556 *

557 * This works by reading index TIDs from the revmap, and obtaining the index

558 * tuples pointed to by them; the summary values in the index tuples are

559 * compared to the scan keys. We return into the TID bitmap all the pages in

560 * ranges corresponding to index tuples that match the scan keys.

561 *

562 * If a TID from the revmap is read as InvalidTID, we know that range is

563 * unsummarized. Pages in those ranges need to be returned regardless of scan

564 * keys.

565 */

566int64

567 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)

568{

569 Relation idxRel = scan->indexRelation;

570 Buffer buf = InvalidBuffer;

571 BrinDesc *bdesc;

572 Oid heapOid;

573 Relation heapRel;

574 BrinOpaque *opaque;

575 BlockNumber nblocks;

576 BlockNumber heapBlk;

577 int64 totalpages = 0;

578 FmgrInfo *consistentFn;

579 MemoryContext oldcxt;

580 MemoryContext perRangeCxt;

581 BrinMemTuple *dtup;

582 BrinTuple *btup = NULL;

583 Size btupsz = 0;

584 ScanKey **keys,

585 **nullkeys;

586 int *nkeys,

587 *nnullkeys;

588 char *ptr;

589 Size len;

590 char *tmp PG_USED_FOR_ASSERTS_ONLY;

591

592 opaque = (BrinOpaque *) scan->opaque;

593 bdesc = opaque->bo_bdesc;

594 pgstat_count_index_scan(idxRel);

595 if (scan->instrument)

596 scan->instrument->nsearches++;

597

598 /*

599 * We need to know the size of the table so that we know how long to

600 * iterate on the revmap.

601 */

602 heapOid = IndexGetRelation(RelationGetRelid(idxRel), false);

603 heapRel = table_open(heapOid, AccessShareLock);

604 nblocks = RelationGetNumberOfBlocks(heapRel);

605 table_close(heapRel, AccessShareLock);

606

607 /*

608 * Make room for the consistent support procedures of indexed columns. We

609 * don't look them up here; we do that lazily the first time we see a scan

610 * key reference each of them. We rely on zeroing fn_oid to InvalidOid.

611 */

612 consistentFn = palloc0_array(FmgrInfo, bdesc->bd_tupdesc->natts);

613

614 /*

615 * Make room for per-attribute lists of scan keys that we'll pass to the

616 * consistent support procedure. We don't know which attributes have scan

617 * keys, so we allocate space for all attributes. That may use more memory

618 * but it's probably cheaper than determining which attributes are used.

619 *

620 * We keep null and regular keys separate, so that we can pass just the

621 * regular keys to the consistent function easily.

622 *

623 * To reduce the allocation overhead, we allocate one big chunk and then

624 * carve it into smaller arrays ourselves. All the pieces have exactly the

625 * same lifetime, so that's OK.

626 *

627 * XXX The widest index can have 32 attributes, so the amount of wasted

628 * memory is negligible. We could invent a more compact approach (with

629 * just space for used attributes) but that would make the matching more

630 * complex so it's not a good trade-off.

631 */

632 len =

633 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* regular keys */

634 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +

635 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts) +

636 MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts) + /* NULL keys */

637 MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys) * bdesc->bd_tupdesc->natts +

638 MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);

639

640 ptr = palloc(len);

641 tmp = ptr;

642

643 keys = (ScanKey **) ptr;

644 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);

645

646 nullkeys = (ScanKey **) ptr;

647 ptr += MAXALIGN(sizeof(ScanKey *) * bdesc->bd_tupdesc->natts);

648

649 nkeys = (int *) ptr;

650 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);

651

652 nnullkeys = (int *) ptr;

653 ptr += MAXALIGN(sizeof(int) * bdesc->bd_tupdesc->natts);

654

655 for (int i = 0; i < bdesc->bd_tupdesc->natts; i++)

656 {

657 keys[i] = (ScanKey *) ptr;

658 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);

659

660 nullkeys[i] = (ScanKey *) ptr;

661 ptr += MAXALIGN(sizeof(ScanKey) * scan->numberOfKeys);

662 }

663

664 Assert(tmp + len == ptr);

665

666 /* zero the number of keys */

667 memset(nkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);

668 memset(nnullkeys, 0, sizeof(int) * bdesc->bd_tupdesc->natts);

669

670 /* Preprocess the scan keys - split them into per-attribute arrays. */

671 for (int keyno = 0; keyno < scan->numberOfKeys; keyno++)

672 {

673 ScanKey key = &scan->keyData[keyno];

674 AttrNumber keyattno = key->sk_attno;

675

676 /*

677 * The collation of the scan key must match the collation used in the

678 * index column (but only if the search is not IS NULL/ IS NOT NULL).

679 * Otherwise we shouldn't be using this index ...

680 */

681 Assert((key->sk_flags & SK_ISNULL) ||

682 (key->sk_collation ==

683 TupleDescAttr(bdesc->bd_tupdesc,

684 keyattno - 1)->attcollation));

685

686 /*

687 * First time we see this index attribute, so init as needed.

688 *

689 * This is a bit of an overkill - we don't know how many scan keys are

690 * there for this attribute, so we simply allocate the largest number

691 * possible (as if all keys were for this attribute). This may waste a

692 * bit of memory, but we only expect small number of scan keys in

693 * general, so this should be negligible, and repeated repalloc calls

694 * are not free either.

695 */

696 if (consistentFn[keyattno - 1].fn_oid == InvalidOid)

697 {

698 FmgrInfo *tmp;

699

700 /* First time we see this attribute, so no key/null keys. */

701 Assert(nkeys[keyattno - 1] == 0);

702 Assert(nnullkeys[keyattno - 1] == 0);

703

704 tmp = index_getprocinfo(idxRel, keyattno,

705 BRIN_PROCNUM_CONSISTENT);

706 fmgr_info_copy(&consistentFn[keyattno - 1], tmp,

707 CurrentMemoryContext);

708 }

709

710 /* Add key to the proper per-attribute array. */

711 if (key->sk_flags & SK_ISNULL)

712 {

713 nullkeys[keyattno - 1][nnullkeys[keyattno - 1]] = key;

714 nnullkeys[keyattno - 1]++;

715 }

716 else

717 {

718 keys[keyattno - 1][nkeys[keyattno - 1]] = key;

719 nkeys[keyattno - 1]++;

720 }

721 }

722

723 /* allocate an initial in-memory tuple, out of the per-range memcxt */

724 dtup = brin_new_memtuple(bdesc);

725

726 /*

727 * Setup and use a per-range memory context, which is reset every time we

728 * loop below. This avoids having to free the tuples within the loop.

729 */

730 perRangeCxt = AllocSetContextCreate(CurrentMemoryContext,

731 "bringetbitmap cxt",

732 ALLOCSET_DEFAULT_SIZES);

733 oldcxt = MemoryContextSwitchTo(perRangeCxt);

734

735 /*

736 * Now scan the revmap. We start by querying for heap page 0,

737 * incrementing by the number of pages per range; this gives us a full

738 * view of the table.

739 */

740 for (heapBlk = 0; heapBlk < nblocks; heapBlk += opaque->bo_pagesPerRange)

741 {

742 bool addrange;

743 bool gottuple = false;

744 BrinTuple *tup;

745 OffsetNumber off;

746 Size size;

747

748 CHECK_FOR_INTERRUPTS();

749

750 MemoryContextReset(perRangeCxt);

751

752 tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,

753 &off, &size, BUFFER_LOCK_SHARE);

754 if (tup)

755 {

756 gottuple = true;

757 btup = brin_copy_tuple(tup, size, btup, &btupsz);

758 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

759 }

760

761 /*

762 * For page ranges with no indexed tuple, we must return the whole

763 * range; otherwise, compare it to the scan keys.

764 */

765 if (!gottuple)

766 {

767 addrange = true;

768 }

769 else

770 {

771 dtup = brin_deform_tuple(bdesc, btup, dtup);

772 if (dtup->bt_placeholder)

773 {

774 /*

775 * Placeholder tuples are always returned, regardless of the

776 * values stored in them.

777 */

778 addrange = true;

779 }

780 else

781 {

782 int attno;

783

784 /*

785 * Compare scan keys with summary values stored for the range.

786 * If scan keys are matched, the page range must be added to

787 * the bitmap. We initially assume the range needs to be

788 * added; in particular this serves the case where there are

789 * no keys.

790 */

791 addrange = true;

792 for (attno = 1; attno <= bdesc->bd_tupdesc->natts; attno++)

793 {

794 BrinValues *bval;

795 Datum add;

796 Oid collation;

797

798 /*

799 * skip attributes without any scan keys (both regular and

800 * IS [NOT] NULL)

801 */

802 if (nkeys[attno - 1] == 0 && nnullkeys[attno - 1] == 0)

803 continue;

804

805 bval = &dtup->bt_columns[attno - 1];

806

807 /*

808 * If the BRIN tuple indicates that this range is empty,

809 * we can skip it: there's nothing to match. We don't

810 * need to examine the next columns.

811 */

812 if (dtup->bt_empty_range)

813 {

814 addrange = false;

815 break;

816 }

817

818 /*

819 * First check if there are any IS [NOT] NULL scan keys,

820 * and if we're violating them. In that case we can

821 * terminate early, without invoking the support function.

822 *

823 * As there may be more keys, we can only determine

824 * mismatch within this loop.

825 */

826 if (bdesc->bd_info[attno - 1]->oi_regular_nulls &&

827 !check_null_keys(bval, nullkeys[attno - 1],

828 nnullkeys[attno - 1]))

829 {

830 /*

831 * If any of the IS [NOT] NULL keys failed, the page

832 * range as a whole can't pass. So terminate the loop.

833 */

834 addrange = false;

835 break;

836 }

837

838 /*

839 * So either there are no IS [NOT] NULL keys, or all

840 * passed. If there are no regular scan keys, we're done -

841 * the page range matches. If there are regular keys, but

842 * the page range is marked as 'all nulls' it can't

843 * possibly pass (we're assuming the operators are

844 * strict).

845 */

846

847 /* No regular scan keys - page range as a whole passes. */

848 if (!nkeys[attno - 1])

849 continue;

850

851 Assert((nkeys[attno - 1] > 0) &&

852 (nkeys[attno - 1] <= scan->numberOfKeys));

853

854 /* If it is all nulls, it cannot possibly be consistent. */

855 if (bval->bv_allnulls)

856 {

857 addrange = false;

858 break;

859 }

860

861 /*

862 * Collation from the first key (has to be the same for

863 * all keys for the same attribute).

864 */

865 collation = keys[attno - 1][0]->sk_collation;

866

867 /*

868 * Check whether the scan key is consistent with the page

869 * range values; if so, have the pages in the range added

870 * to the output bitmap.

871 *

872 * The opclass may or may not support processing of

873 * multiple scan keys. We can determine that based on the

874 * number of arguments - functions with extra parameter

875 * (number of scan keys) do support this, otherwise we

876 * have to simply pass the scan keys one by one.

877 */

878 if (consistentFn[attno - 1].fn_nargs >= 4)

879 {

880 /* Check all keys at once */

881 add = FunctionCall4Coll(&consistentFn[attno - 1],

882 collation,

883 PointerGetDatum(bdesc),

884 PointerGetDatum(bval),

885 PointerGetDatum(keys[attno - 1]),

886 Int32GetDatum(nkeys[attno - 1]));

887 addrange = DatumGetBool(add);

888 }

889 else

890 {

891 /*

892 * Check keys one by one

893 *

894 * When there are multiple scan keys, failure to meet

895 * the criteria for a single one of them is enough to

896 * discard the range as a whole, so break out of the

897 * loop as soon as a false return value is obtained.

898 */

899 int keyno;

900

901 for (keyno = 0; keyno < nkeys[attno - 1]; keyno++)

902 {

903 add = FunctionCall3Coll(&consistentFn[attno - 1],

904 keys[attno - 1][keyno]->sk_collation,

905 PointerGetDatum(bdesc),

906 PointerGetDatum(bval),

907 PointerGetDatum(keys[attno - 1][keyno]));

908 addrange = DatumGetBool(add);

909 if (!addrange)

910 break;

911 }

912 }

913

914 /*

915 * If we found a scan key eliminating the range, no need

916 * to check additional ones.

917 */

918 if (!addrange)

919 break;

920 }

921 }

922 }

923

924 /* add the pages in the range to the output bitmap, if needed */

925 if (addrange)

926 {

927 BlockNumber pageno;

928

929 for (pageno = heapBlk;

930 pageno <= Min(nblocks, heapBlk + opaque->bo_pagesPerRange) - 1;

931 pageno++)

932 {

933 MemoryContextSwitchTo(oldcxt);

934 tbm_add_page(tbm, pageno);

935 totalpages++;

936 MemoryContextSwitchTo(perRangeCxt);

937 }

938 }

939 }

940

941 MemoryContextSwitchTo(oldcxt);

942 MemoryContextDelete(perRangeCxt);

943

944 if (buf != InvalidBuffer)

945 ReleaseBuffer(buf);

946

947 /*

948 * XXX We have an approximation of the number of *pages* that our scan

949 * returns, but we don't have a precise idea of the number of heap tuples

950 * involved.

951 */

952 return totalpages * 10;

953}

954

955/*

956 * Re-initialize state for a BRIN index scan

957 */

958void

959 brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,

960 ScanKey orderbys, int norderbys)

961{

962 /*

963 * Other index AMs preprocess the scan keys at this point, or sometime

964 * early during the scan; this lets them optimize by removing redundant

965 * keys, or doing early returns when they are impossible to satisfy; see

966 * _bt_preprocess_keys for an example. Something like that could be added

967 * here someday, too.

968 */

969

970 if (scankey && scan->numberOfKeys > 0)

971 memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));

972}

973

974/*

975 * Close down a BRIN index scan

976 */

977void

978 brinendscan(IndexScanDesc scan)

979{

980 BrinOpaque *opaque = (BrinOpaque *) scan->opaque;

981

982 brinRevmapTerminate(opaque->bo_rmAccess);

983 brin_free_desc(opaque->bo_bdesc);

984 pfree(opaque);

985}

986

987/*

988 * Per-heap-tuple callback for table_index_build_scan.

989 *

990 * Note we don't worry about the page range at the end of the table here; it is

991 * present in the build state struct after we're called the last time, but not

992 * inserted into the index. Caller must ensure to do so, if appropriate.

993 */

994static void

995 brinbuildCallback(Relation index,

996 ItemPointer tid,

997 Datum *values,

998 bool *isnull,

999 bool tupleIsAlive,

1000 void *brstate)

1001{

1002 BrinBuildState *state = (BrinBuildState *) brstate;

1003 BlockNumber thisblock;

1004

1005 thisblock = ItemPointerGetBlockNumber(tid);

1006

1007 /*

1008 * If we're in a block that belongs to a future range, summarize what

1009 * we've got and start afresh. Note the scan might have skipped many

1010 * pages, if they were devoid of live tuples; make sure to insert index

1011 * tuples for those too.

1012 */

1013 while (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1)

1014 {

1015

1016 BRIN_elog((DEBUG2,

1017 "brinbuildCallback: completed a range: %u--%u",

1018 state->bs_currRangeStart,

1019 state->bs_currRangeStart + state->bs_pagesPerRange));

1020

1021 /* create the index tuple and insert it */

1022 form_and_insert_tuple(state);

1023

1024 /* set state to correspond to the next range */

1025 state->bs_currRangeStart += state->bs_pagesPerRange;

1026

1027 /* re-initialize state for it */

1028 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

1029 }

1030

1031 /* Accumulate the current tuple into the running state */

1032 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,

1033 values, isnull);

1034}

1035

1036/*

1037 * Per-heap-tuple callback for table_index_build_scan with parallelism.

1038 *

1039 * A version of the callback used by parallel index builds. The main difference

1040 * is that instead of writing the BRIN tuples into the index, we write them

1041 * into a shared tuplesort, and leave the insertion up to the leader (which may

1042 * reorder them a bit etc.). The callback also does not generate empty ranges,

1043 * those will be added by the leader when merging results from workers.

1044 */

1045static void

1046 brinbuildCallbackParallel(Relation index,

1047 ItemPointer tid,

1048 Datum *values,

1049 bool *isnull,

1050 bool tupleIsAlive,

1051 void *brstate)

1052{

1053 BrinBuildState *state = (BrinBuildState *) brstate;

1054 BlockNumber thisblock;

1055

1056 thisblock = ItemPointerGetBlockNumber(tid);

1057

1058 /*

1059 * If we're in a block that belongs to a different range, summarize what

1060 * we've got and start afresh. Note the scan might have skipped many

1061 * pages, if they were devoid of live tuples; we do not create empty BRIN

1062 * ranges here - the leader is responsible for filling them in.

1063 *

1064 * Unlike serial builds, parallel index builds allow synchronized seqscans

1065 * (because that's what parallel scans do). This means the block may wrap

1066 * around to the beginning of the relation, so the condition needs to

1067 * check for both future and past ranges.

1068 */

1069 if ((thisblock < state->bs_currRangeStart) ||

1070 (thisblock > state->bs_currRangeStart + state->bs_pagesPerRange - 1))

1071 {

1072

1073 BRIN_elog((DEBUG2,

1074 "brinbuildCallbackParallel: completed a range: %u--%u",

1075 state->bs_currRangeStart,

1076 state->bs_currRangeStart + state->bs_pagesPerRange));

1077

1078 /* create the index tuple and write it into the tuplesort */

1079 form_and_spill_tuple(state);

1080

1081 /*

1082 * Set state to correspond to the next range (for this block).

1083 *

1084 * This skips ranges that are either empty (and so we don't get any

1085 * tuples to summarize), or processed by other workers. We can't

1086 * differentiate those cases here easily, so we leave it up to the

1087 * leader to fill empty ranges where needed.

1088 */

1089 state->bs_currRangeStart

1090 = state->bs_pagesPerRange * (thisblock / state->bs_pagesPerRange);

1091

1092 /* re-initialize state for it */

1093 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

1094 }

1095

1096 /* Accumulate the current tuple into the running state */

1097 (void) add_values_to_range(index, state->bs_bdesc, state->bs_dtuple,

1098 values, isnull);

1099}

1100

1101/*

1102 * brinbuild() -- build a new BRIN index.

1103 */

1104IndexBuildResult *

1105 brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)

1106{

1107 IndexBuildResult *result;

1108 double reltuples;

1109 double idxtuples;

1110 BrinRevmap *revmap;

1111 BrinBuildState *state;

1112 Buffer meta;

1113 BlockNumber pagesPerRange;

1114

1115 /*

1116 * We expect to be called exactly once for any index relation.

1117 */

1118 if (RelationGetNumberOfBlocks(index) != 0)

1119 elog(ERROR, "index \"%s\" already contains data",

1120 RelationGetRelationName(index));

1121

1122 /*

1123 * Critical section not required, because on error the creation of the

1124 * whole relation will be rolled back.

1125 */

1126

1127 meta = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,

1128 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

1129 Assert(BufferGetBlockNumber(meta) == BRIN_METAPAGE_BLKNO);

1130

1131 brin_metapage_init(BufferGetPage(meta), BrinGetPagesPerRange(index),

1132 BRIN_CURRENT_VERSION);

1133 MarkBufferDirty(meta);

1134

1135 if (RelationNeedsWAL(index))

1136 {

1137 xl_brin_createidx xlrec;

1138 XLogRecPtr recptr;

1139 Page page;

1140

1141 xlrec.version = BRIN_CURRENT_VERSION;

1142 xlrec.pagesPerRange = BrinGetPagesPerRange(index);

1143

1144 XLogBeginInsert();

1145 XLogRegisterData(&xlrec, SizeOfBrinCreateIdx);

1146 XLogRegisterBuffer(0, meta, REGBUF_WILL_INIT | REGBUF_STANDARD);

1147

1148 recptr = XLogInsert(RM_BRIN_ID, XLOG_BRIN_CREATE_INDEX);

1149

1150 page = BufferGetPage(meta);

1151 PageSetLSN(page, recptr);

1152 }

1153

1154 UnlockReleaseBuffer(meta);

1155

1156 /*

1157 * Initialize our state, including the deformed tuple state.

1158 */

1159 revmap = brinRevmapInitialize(index, &pagesPerRange);

1160 state = initialize_brin_buildstate(index, revmap, pagesPerRange,

1161 RelationGetNumberOfBlocks(heap));

1162

1163 /*

1164 * Attempt to launch parallel worker scan when required

1165 *

1166 * XXX plan_create_index_workers makes the number of workers dependent on

1167 * maintenance_work_mem, requiring 32MB for each worker. That makes sense

1168 * for btree, but not for BRIN, which can do with much less memory. So

1169 * maybe make that somehow less strict, optionally?

1170 */

1171 if (indexInfo->ii_ParallelWorkers > 0)

1172 _brin_begin_parallel(state, heap, index, indexInfo->ii_Concurrent,

1173 indexInfo->ii_ParallelWorkers);

1174

1175 /*

1176 * If parallel build requested and at least one worker process was

1177 * successfully launched, set up coordination state, wait for workers to

1178 * complete. Then read all tuples from the shared tuplesort and insert

1179 * them into the index.

1180 *

1181 * In serial mode, simply scan the table and build the index one index

1182 * tuple at a time.

1183 */

1184 if (state->bs_leader)

1185 {

1186 SortCoordinate coordinate;

1187

1188 coordinate = (SortCoordinate) palloc0(sizeof(SortCoordinateData));

1189 coordinate->isWorker = false;

1190 coordinate->nParticipants =

1191 state->bs_leader->nparticipanttuplesorts;

1192 coordinate->sharedsort = state->bs_leader->sharedsort;

1193

1194 /*

1195 * Begin leader tuplesort.

1196 *

1197 * In cases where parallelism is involved, the leader receives the

1198 * same share of maintenance_work_mem as a serial sort (it is

1199 * generally treated in the same way as a serial sort once we return).

1200 * Parallel worker Tuplesortstates will have received only a fraction

1201 * of maintenance_work_mem, though.

1202 *

1203 * We rely on the lifetime of the Leader Tuplesortstate almost not

1204 * overlapping with any worker Tuplesortstate's lifetime. There may

1205 * be some small overlap, but that's okay because we rely on leader

1206 * Tuplesortstate only allocating a small, fixed amount of memory

1207 * here. When its tuplesort_performsort() is called (by our caller),

1208 * and significant amounts of memory are likely to be used, all

1209 * workers must have already freed almost all memory held by their

1210 * Tuplesortstates (they are about to go away completely, too). The

1211 * overall effect is that maintenance_work_mem always represents an

1212 * absolute high watermark on the amount of memory used by a CREATE

1213 * INDEX operation, regardless of the use of parallelism or any other

1214 * factor.

1215 */

1216 state->bs_sortstate =

1217 tuplesort_begin_index_brin(maintenance_work_mem, coordinate,

1218 TUPLESORT_NONE);

1219

1220 /* scan the relation and merge per-worker results */

1221 reltuples = _brin_parallel_merge(state);

1222

1223 _brin_end_parallel(state->bs_leader, state);

1224 }

1225 else /* no parallel index build */

1226 {

1227 /*

1228 * Now scan the relation. No syncscan allowed here because we want

1229 * the heap blocks in physical order (we want to produce the ranges

1230 * starting from block 0, and the callback also relies on this to not

1231 * generate summary for the same range twice).

1232 */

1233 reltuples = table_index_build_scan(heap, index, indexInfo, false, true,

1234 brinbuildCallback, state, NULL);

1235

1236 /*

1237 * process the final batch

1238 *

1239 * XXX Note this does not update state->bs_currRangeStart, i.e. it

1240 * stays set to the last range added to the index. This is OK, because

1241 * that's what brin_fill_empty_ranges expects.

1242 */

1243 form_and_insert_tuple(state);

1244

1245 /*

1246 * Backfill the final ranges with empty data.

1247 *

1248 * This saves us from doing what amounts to full table scans when the

1249 * index with a predicate like WHERE (nonnull_column IS NULL), or

1250 * other very selective predicates.

1251 */

1252 brin_fill_empty_ranges(state,

1253 state->bs_currRangeStart,

1254 state->bs_maxRangeStart);

1255 }

1256

1257 /* release resources */

1258 idxtuples = state->bs_numtuples;

1259 brinRevmapTerminate(state->bs_rmAccess);

1260 terminate_brin_buildstate(state);

1261

1262 /*

1263 * Return statistics

1264 */

1265 result = palloc_object(IndexBuildResult);

1266

1267 result->heap_tuples = reltuples;

1268 result->index_tuples = idxtuples;

1269

1270 return result;

1271}

1272

1273void

1274 brinbuildempty(Relation index)

1275{

1276 Buffer metabuf;

1277

1278 /* An empty BRIN index has a metapage only. */

1279 metabuf = ExtendBufferedRel(BMR_REL(index), INIT_FORKNUM, NULL,

1280 EB_LOCK_FIRST | EB_SKIP_EXTENSION_LOCK);

1281

1282 /* Initialize and xlog metabuffer. */

1283 START_CRIT_SECTION();

1284 brin_metapage_init(BufferGetPage(metabuf), BrinGetPagesPerRange(index),

1285 BRIN_CURRENT_VERSION);

1286 MarkBufferDirty(metabuf);

1287 log_newpage_buffer(metabuf, true);

1288 END_CRIT_SECTION();

1289

1290 UnlockReleaseBuffer(metabuf);

1291}

1292

1293/*

1294 * brinbulkdelete

1295 * Since there are no per-heap-tuple index tuples in BRIN indexes,

1296 * there's not a lot we can do here.

1297 *

1298 * XXX we could mark item tuples as "dirty" (when a minimum or maximum heap

1299 * tuple is deleted), meaning the need to re-run summarization on the affected

1300 * range. Would need to add an extra flag in brintuples for that.

1301 */

1302IndexBulkDeleteResult *

1303 brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,

1304 IndexBulkDeleteCallback callback, void *callback_state)

1305{

1306 /* allocate stats if first time through, else re-use existing struct */

1307 if (stats == NULL)

1308 stats = palloc0_object(IndexBulkDeleteResult);

1309

1310 return stats;

1311}

1312

1313/*

1314 * This routine is in charge of "vacuuming" a BRIN index: we just summarize

1315 * ranges that are currently unsummarized.

1316 */

1317IndexBulkDeleteResult *

1318 brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)

1319{

1320 Relation heapRel;

1321

1322 /* No-op in ANALYZE ONLY mode */

1323 if (info->analyze_only)

1324 return stats;

1325

1326 if (!stats)

1327 stats = palloc0_object(IndexBulkDeleteResult);

1328 stats->num_pages = RelationGetNumberOfBlocks(info->index);

1329 /* rest of stats is initialized by zeroing */

1330

1331 heapRel = table_open(IndexGetRelation(RelationGetRelid(info->index), false),

1332 AccessShareLock);

1333

1334 brin_vacuum_scan(info->index, info->strategy);

1335

1336 brinsummarize(info->index, heapRel, BRIN_ALL_BLOCKRANGES, false,

1337 &stats->num_index_tuples, &stats->num_index_tuples);

1338

1339 table_close(heapRel, AccessShareLock);

1340

1341 return stats;

1342}

1343

1344/*

1345 * reloptions processor for BRIN indexes

1346 */

1347bytea *

1348 brinoptions(Datum reloptions, bool validate)

1349{

1350 static const relopt_parse_elt tab[] = {

1351 {"pages_per_range", RELOPT_TYPE_INT, offsetof(BrinOptions, pagesPerRange)},

1352 {"autosummarize", RELOPT_TYPE_BOOL, offsetof(BrinOptions, autosummarize)}

1353 };

1354

1355 return (bytea *) build_reloptions(reloptions, validate,

1356 RELOPT_KIND_BRIN,

1357 sizeof(BrinOptions),

1358 tab, lengthof(tab));

1359}

1360

1361/*

1362 * SQL-callable function to scan through an index and summarize all ranges

1363 * that are not currently summarized.

1364 */

1365Datum

1366 brin_summarize_new_values(PG_FUNCTION_ARGS)

1367{

1368 Datum relation = PG_GETARG_DATUM(0);

1369

1370 return DirectFunctionCall2(brin_summarize_range,

1371 relation,

1372 Int64GetDatum((int64) BRIN_ALL_BLOCKRANGES));

1373}

1374

1375/*

1376 * SQL-callable function to summarize the indicated page range, if not already

1377 * summarized. If the second argument is BRIN_ALL_BLOCKRANGES, all

1378 * unsummarized ranges are summarized.

1379 */

1380Datum

1381 brin_summarize_range(PG_FUNCTION_ARGS)

1382{

1383 Oid indexoid = PG_GETARG_OID(0);

1384 int64 heapBlk64 = PG_GETARG_INT64(1);

1385 BlockNumber heapBlk;

1386 Oid heapoid;

1387 Relation indexRel;

1388 Relation heapRel;

1389 Oid save_userid;

1390 int save_sec_context;

1391 int save_nestlevel;

1392 double numSummarized = 0;

1393

1394 if (RecoveryInProgress())

1395 ereport(ERROR,

1396 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),

1397 errmsg("recovery is in progress"),

1398 errhint("BRIN control functions cannot be executed during recovery.")));

1399

1400 if (heapBlk64 > BRIN_ALL_BLOCKRANGES || heapBlk64 < 0)

1401 ereport(ERROR,

1402 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),

1403 errmsg("block number out of range: %" PRId64, heapBlk64)));

1404 heapBlk = (BlockNumber) heapBlk64;

1405

1406 /*

1407 * We must lock table before index to avoid deadlocks. However, if the

1408 * passed indexoid isn't an index then IndexGetRelation() will fail.

1409 * Rather than emitting a not-very-helpful error message, postpone

1410 * complaining, expecting that the is-it-an-index test below will fail.

1411 */

1412 heapoid = IndexGetRelation(indexoid, true);

1413 if (OidIsValid(heapoid))

1414 {

1415 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);

1416

1417 /*

1418 * Autovacuum calls us. For its benefit, switch to the table owner's

1419 * userid, so that any index functions are run as that user. Also

1420 * lock down security-restricted operations and arrange to make GUC

1421 * variable changes local to this command. This is harmless, albeit

1422 * unnecessary, when called from SQL, because we fail shortly if the

1423 * user does not own the index.

1424 */

1425 GetUserIdAndSecContext(&save_userid, &save_sec_context);

1426 SetUserIdAndSecContext(heapRel->rd_rel->relowner,

1427 save_sec_context | SECURITY_RESTRICTED_OPERATION);

1428 save_nestlevel = NewGUCNestLevel();

1429 RestrictSearchPath();

1430 }

1431 else

1432 {

1433 heapRel = NULL;

1434 /* Set these just to suppress "uninitialized variable" warnings */

1435 save_userid = InvalidOid;

1436 save_sec_context = -1;

1437 save_nestlevel = -1;

1438 }

1439

1440 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

1441

1442 /* Must be a BRIN index */

1443 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||

1444 indexRel->rd_rel->relam != BRIN_AM_OID)

1445 ereport(ERROR,

1446 (errcode(ERRCODE_WRONG_OBJECT_TYPE),

1447 errmsg("\"%s\" is not a BRIN index",

1448 RelationGetRelationName(indexRel))));

1449

1450 /* User must own the index (comparable to privileges needed for VACUUM) */

1451 if (heapRel != NULL && !object_ownercheck(RelationRelationId, indexoid, save_userid))

1452 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,

1453 RelationGetRelationName(indexRel));

1454

1455 /*

1456 * Since we did the IndexGetRelation call above without any lock, it's

1457 * barely possible that a race against an index drop/recreation could have

1458 * netted us the wrong table. Recheck.

1459 */

1460 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))

1461 ereport(ERROR,

1462 (errcode(ERRCODE_UNDEFINED_TABLE),

1463 errmsg("could not open parent table of index \"%s\"",

1464 RelationGetRelationName(indexRel))));

1465

1466 /* see gin_clean_pending_list() */

1467 if (indexRel->rd_index->indisvalid)

1468 brinsummarize(indexRel, heapRel, heapBlk, true, &numSummarized, NULL);

1469 else

1470 ereport(DEBUG1,

1471 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),

1472 errmsg("index \"%s\" is not valid",

1473 RelationGetRelationName(indexRel))));

1474

1475 /* Roll back any GUC changes executed by index functions */

1476 AtEOXact_GUC(false, save_nestlevel);

1477

1478 /* Restore userid and security context */

1479 SetUserIdAndSecContext(save_userid, save_sec_context);

1480

1481 relation_close(indexRel, ShareUpdateExclusiveLock);

1482 relation_close(heapRel, ShareUpdateExclusiveLock);

1483

1484 PG_RETURN_INT32((int32) numSummarized);

1485}

1486

1487/*

1488 * SQL-callable interface to mark a range as no longer summarized

1489 */

1490Datum

1491 brin_desummarize_range(PG_FUNCTION_ARGS)

1492{

1493 Oid indexoid = PG_GETARG_OID(0);

1494 int64 heapBlk64 = PG_GETARG_INT64(1);

1495 BlockNumber heapBlk;

1496 Oid heapoid;

1497 Relation heapRel;

1498 Relation indexRel;

1499 bool done;

1500

1501 if (RecoveryInProgress())

1502 ereport(ERROR,

1503 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),

1504 errmsg("recovery is in progress"),

1505 errhint("BRIN control functions cannot be executed during recovery.")));

1506

1507 if (heapBlk64 > MaxBlockNumber || heapBlk64 < 0)

1508 ereport(ERROR,

1509 (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),

1510 errmsg("block number out of range: %" PRId64,

1511 heapBlk64)));

1512 heapBlk = (BlockNumber) heapBlk64;

1513

1514 /*

1515 * We must lock table before index to avoid deadlocks. However, if the

1516 * passed indexoid isn't an index then IndexGetRelation() will fail.

1517 * Rather than emitting a not-very-helpful error message, postpone

1518 * complaining, expecting that the is-it-an-index test below will fail.

1519 *

1520 * Unlike brin_summarize_range(), autovacuum never calls this. Hence, we

1521 * don't switch userid.

1522 */

1523 heapoid = IndexGetRelation(indexoid, true);

1524 if (OidIsValid(heapoid))

1525 heapRel = table_open(heapoid, ShareUpdateExclusiveLock);

1526 else

1527 heapRel = NULL;

1528

1529 indexRel = index_open(indexoid, ShareUpdateExclusiveLock);

1530

1531 /* Must be a BRIN index */

1532 if (indexRel->rd_rel->relkind != RELKIND_INDEX ||

1533 indexRel->rd_rel->relam != BRIN_AM_OID)

1534 ereport(ERROR,

1535 (errcode(ERRCODE_WRONG_OBJECT_TYPE),

1536 errmsg("\"%s\" is not a BRIN index",

1537 RelationGetRelationName(indexRel))));

1538

1539 /* User must own the index (comparable to privileges needed for VACUUM) */

1540 if (!object_ownercheck(RelationRelationId, indexoid, GetUserId()))

1541 aclcheck_error(ACLCHECK_NOT_OWNER, OBJECT_INDEX,

1542 RelationGetRelationName(indexRel));

1543

1544 /*

1545 * Since we did the IndexGetRelation call above without any lock, it's

1546 * barely possible that a race against an index drop/recreation could have

1547 * netted us the wrong table. Recheck.

1548 */

1549 if (heapRel == NULL || heapoid != IndexGetRelation(indexoid, false))

1550 ereport(ERROR,

1551 (errcode(ERRCODE_UNDEFINED_TABLE),

1552 errmsg("could not open parent table of index \"%s\"",

1553 RelationGetRelationName(indexRel))));

1554

1555 /* see gin_clean_pending_list() */

1556 if (indexRel->rd_index->indisvalid)

1557 {

1558 /* the revmap does the hard work */

1559 do

1560 {

1561 done = brinRevmapDesummarizeRange(indexRel, heapBlk);

1562 }

1563 while (!done);

1564 }

1565 else

1566 ereport(DEBUG1,

1567 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),

1568 errmsg("index \"%s\" is not valid",

1569 RelationGetRelationName(indexRel))));

1570

1571 relation_close(indexRel, ShareUpdateExclusiveLock);

1572 relation_close(heapRel, ShareUpdateExclusiveLock);

1573

1574 PG_RETURN_VOID();

1575}

1576

1577/*

1578 * Build a BrinDesc used to create or scan a BRIN index

1579 */

1580BrinDesc *

1581 brin_build_desc(Relation rel)

1582{

1583 BrinOpcInfo **opcinfo;

1584 BrinDesc *bdesc;

1585 TupleDesc tupdesc;

1586 int totalstored = 0;

1587 int keyno;

1588 long totalsize;

1589 MemoryContext cxt;

1590 MemoryContext oldcxt;

1591

1592 cxt = AllocSetContextCreate(CurrentMemoryContext,

1593 "brin desc cxt",

1594 ALLOCSET_SMALL_SIZES);

1595 oldcxt = MemoryContextSwitchTo(cxt);

1596 tupdesc = RelationGetDescr(rel);

1597

1598 /*

1599 * Obtain BrinOpcInfo for each indexed column. While at it, accumulate

1600 * the number of columns stored, since the number is opclass-defined.

1601 */

1602 opcinfo = palloc_array(BrinOpcInfo *, tupdesc->natts);

1603 for (keyno = 0; keyno < tupdesc->natts; keyno++)

1604 {

1605 FmgrInfo *opcInfoFn;

1606 Form_pg_attribute attr = TupleDescAttr(tupdesc, keyno);

1607

1608 opcInfoFn = index_getprocinfo(rel, keyno + 1, BRIN_PROCNUM_OPCINFO);

1609

1610 opcinfo[keyno] = (BrinOpcInfo *)

1611 DatumGetPointer(FunctionCall1(opcInfoFn, ObjectIdGetDatum(attr->atttypid)));

1612 totalstored += opcinfo[keyno]->oi_nstored;

1613 }

1614

1615 /* Allocate our result struct and fill it in */

1616 totalsize = offsetof(BrinDesc, bd_info) +

1617 sizeof(BrinOpcInfo *) * tupdesc->natts;

1618

1619 bdesc = palloc(totalsize);

1620 bdesc->bd_context = cxt;

1621 bdesc->bd_index = rel;

1622 bdesc->bd_tupdesc = tupdesc;

1623 bdesc->bd_disktdesc = NULL; /* generated lazily */

1624 bdesc->bd_totalstored = totalstored;

1625

1626 for (keyno = 0; keyno < tupdesc->natts; keyno++)

1627 bdesc->bd_info[keyno] = opcinfo[keyno];

1628 pfree(opcinfo);

1629

1630 MemoryContextSwitchTo(oldcxt);

1631

1632 return bdesc;

1633}

1634

1635void

1636 brin_free_desc(BrinDesc *bdesc)

1637{

1638 /* make sure the tupdesc is still valid */

1639 Assert(bdesc->bd_tupdesc->tdrefcount >= 1);

1640 /* no need for retail pfree */

1641 MemoryContextDelete(bdesc->bd_context);

1642}

1643

1644/*

1645 * Fetch index's statistical data into *stats

1646 */

1647void

1648 brinGetStats(Relation index, BrinStatsData *stats)

1649{

1650 Buffer metabuffer;

1651 Page metapage;

1652 BrinMetaPageData *metadata;

1653

1654 metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);

1655 LockBuffer(metabuffer, BUFFER_LOCK_SHARE);

1656 metapage = BufferGetPage(metabuffer);

1657 metadata = (BrinMetaPageData *) PageGetContents(metapage);

1658

1659 stats->pagesPerRange = metadata->pagesPerRange;

1660 stats->revmapNumPages = metadata->lastRevmapPage - 1;

1661

1662 UnlockReleaseBuffer(metabuffer);

1663}

1664

1665/*

1666 * Initialize a BrinBuildState appropriate to create tuples on the given index.

1667 */

1668static BrinBuildState *

1669 initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap,

1670 BlockNumber pagesPerRange, BlockNumber tablePages)

1671{

1672 BrinBuildState *state;

1673 BlockNumber lastRange = 0;

1674

1675 state = palloc_object(BrinBuildState);

1676

1677 state->bs_irel = idxRel;

1678 state->bs_numtuples = 0;

1679 state->bs_reltuples = 0;

1680 state->bs_currentInsertBuf = InvalidBuffer;

1681 state->bs_pagesPerRange = pagesPerRange;

1682 state->bs_currRangeStart = 0;

1683 state->bs_rmAccess = revmap;

1684 state->bs_bdesc = brin_build_desc(idxRel);

1685 state->bs_dtuple = brin_new_memtuple(state->bs_bdesc);

1686 state->bs_leader = NULL;

1687 state->bs_worker_id = 0;

1688 state->bs_sortstate = NULL;

1689 state->bs_context = CurrentMemoryContext;

1690 state->bs_emptyTuple = NULL;

1691 state->bs_emptyTupleLen = 0;

1692

1693 /* Remember the memory context to use for an empty tuple, if needed. */

1694 state->bs_context = CurrentMemoryContext;

1695 state->bs_emptyTuple = NULL;

1696 state->bs_emptyTupleLen = 0;

1697

1698 /*

1699 * Calculate the start of the last page range. Page numbers are 0-based,

1700 * so to calculate the index we need to subtract one. The integer division

1701 * gives us the index of the page range.

1702 */

1703 if (tablePages > 0)

1704 lastRange = ((tablePages - 1) / pagesPerRange) * pagesPerRange;

1705

1706 /* Now calculate the start of the next range. */

1707 state->bs_maxRangeStart = lastRange + state->bs_pagesPerRange;

1708

1709 return state;

1710}

1711

1712/*

1713 * Release resources associated with a BrinBuildState.

1714 */

1715static void

1716 terminate_brin_buildstate(BrinBuildState *state)

1717{

1718 /*

1719 * Release the last index buffer used. We might as well ensure that

1720 * whatever free space remains in that page is available in FSM, too.

1721 */

1722 if (!BufferIsInvalid(state->bs_currentInsertBuf))

1723 {

1724 Page page;

1725 Size freespace;

1726 BlockNumber blk;

1727

1728 page = BufferGetPage(state->bs_currentInsertBuf);

1729 freespace = PageGetFreeSpace(page);

1730 blk = BufferGetBlockNumber(state->bs_currentInsertBuf);

1731 ReleaseBuffer(state->bs_currentInsertBuf);

1732 RecordPageWithFreeSpace(state->bs_irel, blk, freespace);

1733 FreeSpaceMapVacuumRange(state->bs_irel, blk, blk + 1);

1734 }

1735

1736 brin_free_desc(state->bs_bdesc);

1737 pfree(state->bs_dtuple);

1738 pfree(state);

1739}

1740

1741/*

1742 * On the given BRIN index, summarize the heap page range that corresponds

1743 * to the heap block number given.

1744 *

1745 * This routine can run in parallel with insertions into the heap. To avoid

1746 * missing those values from the summary tuple, we first insert a placeholder

1747 * index tuple into the index, then execute the heap scan; transactions

1748 * concurrent with the scan update the placeholder tuple. After the scan, we

1749 * union the placeholder tuple with the one computed by this routine. The

1750 * update of the index value happens in a loop, so that if somebody updates

1751 * the placeholder tuple after we read it, we detect the case and try again.

1752 * This ensures that the concurrently inserted tuples are not lost.

1753 *

1754 * A further corner case is this routine being asked to summarize the partial

1755 * range at the end of the table. heapNumBlocks is the (possibly outdated)

1756 * table size; if we notice that the requested range lies beyond that size,

1757 * we re-compute the table size after inserting the placeholder tuple, to

1758 * avoid missing pages that were appended recently.

1759 */

1760static void

1761 summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,

1762 BlockNumber heapBlk, BlockNumber heapNumBlks)

1763{

1764 Buffer phbuf;

1765 BrinTuple *phtup;

1766 Size phsz;

1767 OffsetNumber offset;

1768 BlockNumber scanNumBlks;

1769

1770 /*

1771 * Insert the placeholder tuple

1772 */

1773 phbuf = InvalidBuffer;

1774 phtup = brin_form_placeholder_tuple(state->bs_bdesc, heapBlk, &phsz);

1775 offset = brin_doinsert(state->bs_irel, state->bs_pagesPerRange,

1776 state->bs_rmAccess, &phbuf,

1777 heapBlk, phtup, phsz);

1778

1779 /*

1780 * Compute range end. We hold ShareUpdateExclusive lock on table, so it

1781 * cannot shrink concurrently (but it can grow).

1782 */

1783 Assert(heapBlk % state->bs_pagesPerRange == 0);

1784 if (heapBlk + state->bs_pagesPerRange > heapNumBlks)

1785 {

1786 /*

1787 * If we're asked to scan what we believe to be the final range on the

1788 * table (i.e. a range that might be partial) we need to recompute our

1789 * idea of what the latest page is after inserting the placeholder

1790 * tuple. Anyone that grows the table later will update the

1791 * placeholder tuple, so it doesn't matter that we won't scan these

1792 * pages ourselves. Careful: the table might have been extended

1793 * beyond the current range, so clamp our result.

1794 *

1795 * Fortunately, this should occur infrequently.

1796 */

1797 scanNumBlks = Min(RelationGetNumberOfBlocks(heapRel) - heapBlk,

1798 state->bs_pagesPerRange);

1799 }

1800 else

1801 {

1802 /* Easy case: range is known to be complete */

1803 scanNumBlks = state->bs_pagesPerRange;

1804 }

1805

1806 /*

1807 * Execute the partial heap scan covering the heap blocks in the specified

1808 * page range, summarizing the heap tuples in it. This scan stops just

1809 * short of brinbuildCallback creating the new index entry.

1810 *

1811 * Note that it is critical we use the "any visible" mode of

1812 * table_index_build_range_scan here: otherwise, we would miss tuples

1813 * inserted by transactions that are still in progress, among other corner

1814 * cases.

1815 */

1816 state->bs_currRangeStart = heapBlk;

1817 table_index_build_range_scan(heapRel, state->bs_irel, indexInfo, false, true, false,

1818 heapBlk, scanNumBlks,

1819 brinbuildCallback, state, NULL);

1820

1821 /*

1822 * Now we update the values obtained by the scan with the placeholder

1823 * tuple. We do this in a loop which only terminates if we're able to

1824 * update the placeholder tuple successfully; if we are not, this means

1825 * somebody else modified the placeholder tuple after we read it.

1826 */

1827 for (;;)

1828 {

1829 BrinTuple *newtup;

1830 Size newsize;

1831 bool didupdate;

1832 bool samepage;

1833

1834 CHECK_FOR_INTERRUPTS();

1835

1836 /*

1837 * Update the summary tuple and try to update.

1838 */

1839 newtup = brin_form_tuple(state->bs_bdesc,

1840 heapBlk, state->bs_dtuple, &newsize);

1841 samepage = brin_can_do_samepage_update(phbuf, phsz, newsize);

1842 didupdate =

1843 brin_doupdate(state->bs_irel, state->bs_pagesPerRange,

1844 state->bs_rmAccess, heapBlk, phbuf, offset,

1845 phtup, phsz, newtup, newsize, samepage);

1846 brin_free_tuple(phtup);

1847 brin_free_tuple(newtup);

1848

1849 /* If the update succeeded, we're done. */

1850 if (didupdate)

1851 break;

1852

1853 /*

1854 * If the update didn't work, it might be because somebody updated the

1855 * placeholder tuple concurrently. Extract the new version, union it

1856 * with the values we have from the scan, and start over. (There are

1857 * other reasons for the update to fail, but it's simple to treat them

1858 * the same.)

1859 */

1860 phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,

1861 &offset, &phsz, BUFFER_LOCK_SHARE);

1862 /* the placeholder tuple must exist */

1863 if (phtup == NULL)

1864 elog(ERROR, "missing placeholder tuple");

1865 phtup = brin_copy_tuple(phtup, phsz, NULL, NULL);

1866 LockBuffer(phbuf, BUFFER_LOCK_UNLOCK);

1867

1868 /* merge it into the tuple from the heap scan */

1869 union_tuples(state->bs_bdesc, state->bs_dtuple, phtup);

1870 }

1871

1872 ReleaseBuffer(phbuf);

1873}

1874

1875/*

1876 * Summarize page ranges that are not already summarized. If pageRange is

1877 * BRIN_ALL_BLOCKRANGES then the whole table is scanned; otherwise, only the

1878 * page range containing the given heap page number is scanned.

1879 * If include_partial is true, then the partial range at the end of the table

1880 * is summarized, otherwise not.

1881 *

1882 * For each new index tuple inserted, *numSummarized (if not NULL) is

1883 * incremented; for each existing tuple, *numExisting (if not NULL) is

1884 * incremented.

1885 */

1886static void

1887 brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange,

1888 bool include_partial, double *numSummarized, double *numExisting)

1889{

1890 BrinRevmap *revmap;

1891 BrinBuildState *state = NULL;

1892 IndexInfo *indexInfo = NULL;

1893 BlockNumber heapNumBlocks;

1894 BlockNumber pagesPerRange;

1895 Buffer buf;

1896 BlockNumber startBlk;

1897

1898 revmap = brinRevmapInitialize(index, &pagesPerRange);

1899

1900 /* determine range of pages to process */

1901 heapNumBlocks = RelationGetNumberOfBlocks(heapRel);

1902 if (pageRange == BRIN_ALL_BLOCKRANGES)

1903 startBlk = 0;

1904 else

1905 {

1906 startBlk = (pageRange / pagesPerRange) * pagesPerRange;

1907 heapNumBlocks = Min(heapNumBlocks, startBlk + pagesPerRange);

1908 }

1909 if (startBlk > heapNumBlocks)

1910 {

1911 /* Nothing to do if start point is beyond end of table */

1912 brinRevmapTerminate(revmap);

1913 return;

1914 }

1915

1916 /*

1917 * Scan the revmap to find unsummarized items.

1918 */

1919 buf = InvalidBuffer;

1920 for (; startBlk < heapNumBlocks; startBlk += pagesPerRange)

1921 {

1922 BrinTuple *tup;

1923 OffsetNumber off;

1924

1925 /*

1926 * Unless requested to summarize even a partial range, go away now if

1927 * we think the next range is partial. Caller would pass true when it

1928 * is typically run once bulk data loading is done

1929 * (brin_summarize_new_values), and false when it is typically the

1930 * result of arbitrarily-scheduled maintenance command (vacuuming).

1931 */

1932 if (!include_partial &&

1933 (startBlk + pagesPerRange > heapNumBlocks))

1934 break;

1935

1936 CHECK_FOR_INTERRUPTS();

1937

1938 tup = brinGetTupleForHeapBlock(revmap, startBlk, &buf, &off, NULL,

1939 BUFFER_LOCK_SHARE);

1940 if (tup == NULL)

1941 {

1942 /* no revmap entry for this heap range. Summarize it. */

1943 if (state == NULL)

1944 {

1945 /* first time through */

1946 Assert(!indexInfo);

1947 state = initialize_brin_buildstate(index, revmap,

1948 pagesPerRange,

1949 InvalidBlockNumber);

1950 indexInfo = BuildIndexInfo(index);

1951 }

1952 summarize_range(indexInfo, state, heapRel, startBlk, heapNumBlocks);

1953

1954 /* and re-initialize state for the next range */

1955 brin_memtuple_initialize(state->bs_dtuple, state->bs_bdesc);

1956

1957 if (numSummarized)

1958 *numSummarized += 1.0;

1959 }

1960 else

1961 {

1962 if (numExisting)

1963 *numExisting += 1.0;

1964 LockBuffer(buf, BUFFER_LOCK_UNLOCK);

1965 }

1966 }

1967

1968 if (BufferIsValid(buf))

1969 ReleaseBuffer(buf);

1970

1971 /* free resources */

1972 brinRevmapTerminate(revmap);

1973 if (state)

1974 {

1975 terminate_brin_buildstate(state);

1976 pfree(indexInfo);

1977 }

1978}

1979

1980/*

1981 * Given a deformed tuple in the build state, convert it into the on-disk

1982 * format and insert it into the index, making the revmap point to it.

1983 */

1984static void

1985 form_and_insert_tuple(BrinBuildState *state)

1986{

1987 BrinTuple *tup;

1988 Size size;

1989

1990 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,

1991 state->bs_dtuple, &size);

1992 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,

1993 &state->bs_currentInsertBuf, state->bs_currRangeStart,

1994 tup, size);

1995 state->bs_numtuples++;

1996

1997 pfree(tup);

1998}

1999

2000/*

2001 * Given a deformed tuple in the build state, convert it into the on-disk

2002 * format and write it to a (shared) tuplesort (the leader will insert it

2003 * into the index later).

2004 */

2005static void

2006 form_and_spill_tuple(BrinBuildState *state)

2007{

2008 BrinTuple *tup;

2009 Size size;

2010

2011 /* don't insert empty tuples in parallel build */

2012 if (state->bs_dtuple->bt_empty_range)

2013 return;

2014

2015 tup = brin_form_tuple(state->bs_bdesc, state->bs_currRangeStart,

2016 state->bs_dtuple, &size);

2017

2018 /* write the BRIN tuple to the tuplesort */

2019 tuplesort_putbrintuple(state->bs_sortstate, tup, size);

2020

2021 state->bs_numtuples++;

2022

2023 pfree(tup);

2024}

2025

2026/*

2027 * Given two deformed tuples, adjust the first one so that it's consistent

2028 * with the summary values in both.

2029 */

2030static void

2031 union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)

2032{

2033 int keyno;

2034 BrinMemTuple *db;

2035 MemoryContext cxt;

2036 MemoryContext oldcxt;

2037

2038 /* Use our own memory context to avoid retail pfree */

2039 cxt = AllocSetContextCreate(CurrentMemoryContext,

2040 "brin union",

2041 ALLOCSET_DEFAULT_SIZES);

2042 oldcxt = MemoryContextSwitchTo(cxt);

2043 db = brin_deform_tuple(bdesc, b, NULL);

2044 MemoryContextSwitchTo(oldcxt);

2045

2046 /*

2047 * Check if the ranges are empty.

2048 *

2049 * If at least one of them is empty, we don't need to call per-key union

2050 * functions at all. If "b" is empty, we just use "a" as the result (it

2051 * might be empty fine, but that's fine). If "a" is empty but "b" is not,

2052 * we use "b" as the result (but we have to copy the data into "a" first).

2053 *

2054 * Only when both ranges are non-empty, we actually do the per-key merge.

2055 */

2056

2057 /* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */

2058 if (db->bt_empty_range)

2059 {

2060 /* skip the per-key merge */

2061 MemoryContextDelete(cxt);

2062 return;

2063 }

2064

2065 /*

2066 * Now we know "b" is not empty. If "a" is empty, then "b" is the result.

2067 * But we need to copy the data from "b" to "a" first, because that's how

2068 * we pass result out.

2069 *

2070 * We have to copy all the global/per-key flags etc. too.

2071 */

2072 if (a->bt_empty_range)

2073 {

2074 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)

2075 {

2076 int i;

2077 BrinValues *col_a = &a->bt_columns[keyno];

2078 BrinValues *col_b = &db->bt_columns[keyno];

2079 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

2080

2081 col_a->bv_allnulls = col_b->bv_allnulls;

2082 col_a->bv_hasnulls = col_b->bv_hasnulls;

2083

2084 /* If "b" has no data, we're done. */

2085 if (col_b->bv_allnulls)

2086 continue;

2087

2088 for (i = 0; i < opcinfo->oi_nstored; i++)

2089 col_a->bv_values[i] =

2090 datumCopy(col_b->bv_values[i],

2091 opcinfo->oi_typcache[i]->typbyval,

2092 opcinfo->oi_typcache[i]->typlen);

2093 }

2094

2095 /* "a" started empty, but "b" was not empty, so remember that */

2096 a->bt_empty_range = false;

2097

2098 /* skip the per-key merge */

2099 MemoryContextDelete(cxt);

2100 return;

2101 }

2102

2103 /* Now we know neither range is empty. */

2104 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)

2105 {

2106 FmgrInfo *unionFn;

2107 BrinValues *col_a = &a->bt_columns[keyno];

2108 BrinValues *col_b = &db->bt_columns[keyno];

2109 BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];

2110

2111 if (opcinfo->oi_regular_nulls)

2112 {

2113 /* Does the "b" summary represent any NULL values? */

2114 bool b_has_nulls = (col_b->bv_hasnulls || col_b->bv_allnulls);

2115

2116 /* Adjust "hasnulls". */

2117 if (!col_a->bv_allnulls && b_has_nulls)

2118 col_a->bv_hasnulls = true;

2119

2120 /* If there are no values in B, there's nothing left to do. */

2121 if (col_b->bv_allnulls)

2122 continue;

2123

2124 /*

2125 * Adjust "allnulls". If A doesn't have values, just copy the

2126 * values from B into A, and we're done. We cannot run the

2127 * operators in this case, because values in A might contain

2128 * garbage. Note we already established that B contains values.

2129 *

2130 * Also adjust "hasnulls" in order not to forget the summary

2131 * represents NULL values. This is not redundant with the earlier

2132 * update, because that only happens when allnulls=false.

2133 */

2134 if (col_a->bv_allnulls)

2135 {

2136 int i;

2137

2138 col_a->bv_allnulls = false;

2139 col_a->bv_hasnulls = true;

2140

2141 for (i = 0; i < opcinfo->oi_nstored; i++)

2142 col_a->bv_values[i] =

2143 datumCopy(col_b->bv_values[i],

2144 opcinfo->oi_typcache[i]->typbyval,

2145 opcinfo->oi_typcache[i]->typlen);

2146

2147 continue;

2148 }

2149 }

2150

2151 unionFn = index_getprocinfo(bdesc->bd_index, keyno + 1,

2152 BRIN_PROCNUM_UNION);

2153 FunctionCall3Coll(unionFn,

2154 bdesc->bd_index->rd_indcollation[keyno],

2155 PointerGetDatum(bdesc),

2156 PointerGetDatum(col_a),

2157 PointerGetDatum(col_b));

2158 }

2159

2160 MemoryContextDelete(cxt);

2161}

2162

2163/*

2164 * brin_vacuum_scan

2165 * Do a complete scan of the index during VACUUM.

2166 *

2167 * This routine scans the complete index looking for uncataloged index pages,

2168 * i.e. those that might have been lost due to a crash after index extension

2169 * and such.

2170 */

2171static void

2172 brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)

2173{

2174 BlockNumber nblocks;

2175 BlockNumber blkno;

2176

2177 /*

2178 * Scan the index in physical order, and clean up any possible mess in

2179 * each page.

2180 */

2181 nblocks = RelationGetNumberOfBlocks(idxrel);

2182 for (blkno = 0; blkno < nblocks; blkno++)

2183 {

2184 Buffer buf;

2185

2186 CHECK_FOR_INTERRUPTS();

2187

2188 buf = ReadBufferExtended(idxrel, MAIN_FORKNUM, blkno,

2189 RBM_NORMAL, strategy);

2190

2191 brin_page_cleanup(idxrel, buf);

2192

2193 ReleaseBuffer(buf);

2194 }

2195

2196 /*

2197 * Update all upper pages in the index's FSM, as well. This ensures not

2198 * only that we propagate leaf-page FSM updates made by brin_page_cleanup,

2199 * but also that any pre-existing damage or out-of-dateness is repaired.

2200 */

2201 FreeSpaceMapVacuum(idxrel);

2202}

2203

2204static bool

2205 add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup,

2206 const Datum *values, const bool *nulls)

2207{

2208 int keyno;

2209

2210 /* If the range starts empty, we're certainly going to modify it. */

2211 bool modified = dtup->bt_empty_range;

2212

2213 /*

2214 * Compare the key values of the new tuple to the stored index values; our

2215 * deformed tuple will get updated if the new tuple doesn't fit the

2216 * original range (note this means we can't break out of the loop early).

2217 * Make a note of whether this happens, so that we know to insert the

2218 * modified tuple later.

2219 */

2220 for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)

2221 {

2222 Datum result;

2223 BrinValues *bval;

2224 FmgrInfo *addValue;

2225 bool has_nulls;

2226

2227 bval = &dtup->bt_columns[keyno];

2228

2229 /*

2230 * Does the range have actual NULL values? Either of the flags can be

2231 * set, but we ignore the state before adding first row.

2232 *

2233 * We have to remember this, because we'll modify the flags and we

2234 * need to know if the range started as empty.

2235 */

2236 has_nulls = ((!dtup->bt_empty_range) &&

2237 (bval->bv_hasnulls || bval->bv_allnulls));

2238

2239 /*

2240 * If the value we're adding is NULL, handle it locally. Otherwise

2241 * call the BRIN_PROCNUM_ADDVALUE procedure.

2242 */

2243 if (bdesc->bd_info[keyno]->oi_regular_nulls && nulls[keyno])

2244 {

2245 /*

2246 * If the new value is null, we record that we saw it if it's the

2247 * first one; otherwise, there's nothing to do.

2248 */

2249 if (!bval->bv_hasnulls)

2250 {

2251 bval->bv_hasnulls = true;

2252 modified = true;

2253 }

2254

2255 continue;

2256 }

2257

2258 addValue = index_getprocinfo(idxRel, keyno + 1,

2259 BRIN_PROCNUM_ADDVALUE);

2260 result = FunctionCall4Coll(addValue,

2261 idxRel->rd_indcollation[keyno],

2262 PointerGetDatum(bdesc),

2263 PointerGetDatum(bval),

2264 values[keyno],

2265 BoolGetDatum(nulls[keyno]));

2266 /* if that returned true, we need to insert the updated tuple */

2267 modified |= DatumGetBool(result);

2268

2269 /*

2270 * If the range was had actual NULL values (i.e. did not start empty),

2271 * make sure we don't forget about the NULL values. Either the

2272 * allnulls flag is still set to true, or (if the opclass cleared it)

2273 * we need to set hasnulls=true.

2274 *

2275 * XXX This can only happen when the opclass modified the tuple, so

2276 * the modified flag should be set.

2277 */

2278 if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))

2279 {

2280 Assert(modified);

2281 bval->bv_hasnulls = true;

2282 }

2283 }

2284

2285 /*

2286 * After updating summaries for all the keys, mark it as not empty.

2287 *

2288 * If we're actually changing the flag value (i.e. tuple started as

2289 * empty), we should have modified the tuple. So we should not see empty

2290 * range that was not modified.

2291 */

2292 Assert(!dtup->bt_empty_range || modified);

2293 dtup->bt_empty_range = false;

2294

2295 return modified;

2296}

2297

2298static bool

2299 check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)

2300{

2301 int keyno;

2302

2303 /*

2304 * First check if there are any IS [NOT] NULL scan keys, and if we're

2305 * violating them.

2306 */

2307 for (keyno = 0; keyno < nnullkeys; keyno++)

2308 {

2309 ScanKey key = nullkeys[keyno];

2310

2311 Assert(key->sk_attno == bval->bv_attno);

2312

2313 /* Handle only IS NULL/IS NOT NULL tests */

2314 if (!(key->sk_flags & SK_ISNULL))

2315 continue;

2316

2317 if (key->sk_flags & SK_SEARCHNULL)

2318 {

2319 /* IS NULL scan key, but range has no NULLs */

2320 if (!bval->bv_allnulls && !bval->bv_hasnulls)

2321 return false;

2322 }

2323 else if (key->sk_flags & SK_SEARCHNOTNULL)

2324 {

2325 /*

2326 * For IS NOT NULL, we can only skip ranges that are known to have

2327 * only nulls.

2328 */

2329 if (bval->bv_allnulls)

2330 return false;

2331 }

2332 else

2333 {

2334 /*

2335 * Neither IS NULL nor IS NOT NULL was used; assume all indexable

2336 * operators are strict and thus return false with NULL value in

2337 * the scan key.

2338 */

2339 return false;

2340 }

2341 }

2342

2343 return true;

2344}

2345

2346/*

2347 * Create parallel context, and launch workers for leader.

2348 *

2349 * buildstate argument should be initialized (with the exception of the

2350 * tuplesort states, which may later be created based on shared

2351 * state initially set up here).

2352 *

2353 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.

2354 *

2355 * request is the target number of parallel worker processes to launch.

2356 *

2357 * Sets buildstate's BrinLeader, which caller must use to shut down parallel

2358 * mode by passing it to _brin_end_parallel() at the very end of its index

2359 * build. If not even a single worker process can be launched, this is

2360 * never set, and caller should proceed with a serial index build.

2361 */

2362static void

2363 _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,

2364 bool isconcurrent, int request)

2365{

2366 ParallelContext *pcxt;

2367 int scantuplesortstates;

2368 Snapshot snapshot;

2369 Size estbrinshared;

2370 Size estsort;

2371 BrinShared *brinshared;

2372 Sharedsort *sharedsort;

2373 BrinLeader *brinleader = (BrinLeader *) palloc0(sizeof(BrinLeader));

2374 WalUsage *walusage;

2375 BufferUsage *bufferusage;

2376 bool leaderparticipates = true;

2377 int querylen;

2378

2379#ifdef DISABLE_LEADER_PARTICIPATION

2380 leaderparticipates = false;

2381#endif

2382

2383 /*

2384 * Enter parallel mode, and create context for parallel build of brin

2385 * index

2386 */

2387 EnterParallelMode();

2388 Assert(request > 0);

2389 pcxt = CreateParallelContext("postgres", "_brin_parallel_build_main",

2390 request);

2391

2392 scantuplesortstates = leaderparticipates ? request + 1 : request;

2393

2394 /*

2395 * Prepare for scan of the base relation. In a normal index build, we use

2396 * SnapshotAny because we must retrieve all tuples and do our own time

2397 * qual checks (because we have to index RECENTLY_DEAD tuples). In a

2398 * concurrent build, we take a regular MVCC snapshot and index whatever's

2399 * live according to that.

2400 */

2401 if (!isconcurrent)

2402 snapshot = SnapshotAny;

2403 else

2404 snapshot = RegisterSnapshot(GetTransactionSnapshot());

2405

2406 /*

2407 * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.

2408 */

2409 estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);

2410 shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);

2411 estsort = tuplesort_estimate_shared(scantuplesortstates);

2412 shm_toc_estimate_chunk(&pcxt->estimator, estsort);

2413

2414 shm_toc_estimate_keys(&pcxt->estimator, 2);

2415

2416 /*

2417 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE

2418 * and PARALLEL_KEY_BUFFER_USAGE.

2419 *

2420 * If there are no extensions loaded that care, we could skip this. We

2421 * have no way of knowing whether anyone's looking at pgWalUsage or

2422 * pgBufferUsage, so do it unconditionally.

2423 */

2424 shm_toc_estimate_chunk(&pcxt->estimator,

2425 mul_size(sizeof(WalUsage), pcxt->nworkers));

2426 shm_toc_estimate_keys(&pcxt->estimator, 1);

2427 shm_toc_estimate_chunk(&pcxt->estimator,

2428 mul_size(sizeof(BufferUsage), pcxt->nworkers));

2429 shm_toc_estimate_keys(&pcxt->estimator, 1);

2430

2431 /* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */

2432 if (debug_query_string)

2433 {

2434 querylen = strlen(debug_query_string);

2435 shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1);

2436 shm_toc_estimate_keys(&pcxt->estimator, 1);

2437 }

2438 else

2439 querylen = 0; /* keep compiler quiet */

2440

2441 /* Everyone's had a chance to ask for space, so now create the DSM */

2442 InitializeParallelDSM(pcxt);

2443

2444 /* If no DSM segment was available, back out (do serial build) */

2445 if (pcxt->seg == NULL)

2446 {

2447 if (IsMVCCSnapshot(snapshot))

2448 UnregisterSnapshot(snapshot);

2449 DestroyParallelContext(pcxt);

2450 ExitParallelMode();

2451 return;

2452 }

2453

2454 /* Store shared build state, for which we reserved space */

2455 brinshared = (BrinShared *) shm_toc_allocate(pcxt->toc, estbrinshared);

2456 /* Initialize immutable state */

2457 brinshared->heaprelid = RelationGetRelid(heap);

2458 brinshared->indexrelid = RelationGetRelid(index);

2459 brinshared->isconcurrent = isconcurrent;

2460 brinshared->scantuplesortstates = scantuplesortstates;

2461 brinshared->pagesPerRange = buildstate->bs_pagesPerRange;

2462 brinshared->queryid = pgstat_get_my_query_id();

2463 ConditionVariableInit(&brinshared->workersdonecv);

2464 SpinLockInit(&brinshared->mutex);

2465

2466 /* Initialize mutable state */

2467 brinshared->nparticipantsdone = 0;

2468 brinshared->reltuples = 0.0;

2469 brinshared->indtuples = 0.0;

2470

2471 table_parallelscan_initialize(heap,

2472 ParallelTableScanFromBrinShared(brinshared),

2473 snapshot);

2474

2475 /*

2476 * Store shared tuplesort-private state, for which we reserved space.

2477 * Then, initialize opaque state using tuplesort routine.

2478 */

2479 sharedsort = (Sharedsort *) shm_toc_allocate(pcxt->toc, estsort);

2480 tuplesort_initialize_shared(sharedsort, scantuplesortstates,

2481 pcxt->seg);

2482

2483 /*

2484 * Store shared tuplesort-private state, for which we reserved space.

2485 * Then, initialize opaque state using tuplesort routine.

2486 */

2487 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BRIN_SHARED, brinshared);

2488 shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

2489

2490 /* Store query string for workers */

2491 if (debug_query_string)

2492 {

2493 char *sharedquery;

2494

2495 sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1);

2496 memcpy(sharedquery, debug_query_string, querylen + 1);

2497 shm_toc_insert(pcxt->toc, PARALLEL_KEY_QUERY_TEXT, sharedquery);

2498 }

2499

2500 /*

2501 * Allocate space for each worker's WalUsage and BufferUsage; no need to

2502 * initialize.

2503 */

2504 walusage = shm_toc_allocate(pcxt->toc,

2505 mul_size(sizeof(WalUsage), pcxt->nworkers));

2506 shm_toc_insert(pcxt->toc, PARALLEL_KEY_WAL_USAGE, walusage);

2507 bufferusage = shm_toc_allocate(pcxt->toc,

2508 mul_size(sizeof(BufferUsage), pcxt->nworkers));

2509 shm_toc_insert(pcxt->toc, PARALLEL_KEY_BUFFER_USAGE, bufferusage);

2510

2511 /* Launch workers, saving status for leader/caller */

2512 LaunchParallelWorkers(pcxt);

2513 brinleader->pcxt = pcxt;

2514 brinleader->nparticipanttuplesorts = pcxt->nworkers_launched;

2515 if (leaderparticipates)

2516 brinleader->nparticipanttuplesorts++;

2517 brinleader->brinshared = brinshared;

2518 brinleader->sharedsort = sharedsort;

2519 brinleader->snapshot = snapshot;

2520 brinleader->walusage = walusage;

2521 brinleader->bufferusage = bufferusage;

2522

2523 /* If no workers were successfully launched, back out (do serial build) */

2524 if (pcxt->nworkers_launched == 0)

2525 {

2526 _brin_end_parallel(brinleader, NULL);

2527 return;

2528 }

2529

2530 /* Save leader state now that it's clear build will be parallel */

2531 buildstate->bs_leader = brinleader;

2532

2533 /* Join heap scan ourselves */

2534 if (leaderparticipates)

2535 _brin_leader_participate_as_worker(buildstate, heap, index);

2536

2537 /*

2538 * Caller needs to wait for all launched workers when we return. Make

2539 * sure that the failure-to-start case will not hang forever.

2540 */

2541 WaitForParallelWorkersToAttach(pcxt);

2542}

2543

2544/*

2545 * Shut down workers, destroy parallel context, and end parallel mode.

2546 */

2547static void

2548 _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)

2549{

2550 int i;

2551

2552 /* Shutdown worker processes */

2553 WaitForParallelWorkersToFinish(brinleader->pcxt);

2554

2555 /*

2556 * Next, accumulate WAL usage. (This must wait for the workers to finish,

2557 * or we might get incomplete data.)

2558 */

2559 for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)

2560 InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);

2561

2562 /* Free last reference to MVCC snapshot, if one was used */

2563 if (IsMVCCSnapshot(brinleader->snapshot))

2564 UnregisterSnapshot(brinleader->snapshot);

2565 DestroyParallelContext(brinleader->pcxt);

2566 ExitParallelMode();

2567}

2568

2569/*

2570 * Within leader, wait for end of heap scan.

2571 *

2572 * When called, parallel heap scan started by _brin_begin_parallel() will

2573 * already be underway within worker processes (when leader participates

2574 * as a worker, we should end up here just as workers are finishing).

2575 *

2576 * Returns the total number of heap tuples scanned.

2577 */

2578static double

2579 _brin_parallel_heapscan(BrinBuildState *state)

2580{

2581 BrinShared *brinshared = state->bs_leader->brinshared;

2582 int nparticipanttuplesorts;

2583

2584 nparticipanttuplesorts = state->bs_leader->nparticipanttuplesorts;

2585 for (;;)

2586 {

2587 SpinLockAcquire(&brinshared->mutex);

2588 if (brinshared->nparticipantsdone == nparticipanttuplesorts)

2589 {

2590 /* copy the data into leader state */

2591 state->bs_reltuples = brinshared->reltuples;

2592 state->bs_numtuples = brinshared->indtuples;

2593

2594 SpinLockRelease(&brinshared->mutex);

2595 break;

2596 }

2597 SpinLockRelease(&brinshared->mutex);

2598

2599 ConditionVariableSleep(&brinshared->workersdonecv,

2600 WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);

2601 }

2602

2603 ConditionVariableCancelSleep();

2604

2605 return state->bs_reltuples;

2606}

2607

2608/*

2609 * Within leader, wait for end of heap scan and merge per-worker results.

2610 *

2611 * After waiting for all workers to finish, merge the per-worker results into

2612 * the complete index. The results from each worker are sorted by block number

2613 * (start of the page range). While combining the per-worker results we merge

2614 * summaries for the same page range, and also fill-in empty summaries for

2615 * ranges without any tuples.

2616 *

2617 * Returns the total number of heap tuples scanned.

2618 */

2619static double

2620 _brin_parallel_merge(BrinBuildState *state)

2621{

2622 BrinTuple *btup;

2623 BrinMemTuple *memtuple = NULL;

2624 Size tuplen;

2625 BlockNumber prevblkno = InvalidBlockNumber;

2626 MemoryContext rangeCxt,

2627 oldCxt;

2628 double reltuples;

2629

2630 /* wait for workers to scan table and produce partial results */

2631 reltuples = _brin_parallel_heapscan(state);

2632

2633 /* do the actual sort in the leader */

2634 tuplesort_performsort(state->bs_sortstate);

2635

2636 /*

2637 * Initialize BrinMemTuple we'll use to union summaries from workers (in

2638 * case they happened to produce parts of the same page range).

2639 */

2640 memtuple = brin_new_memtuple(state->bs_bdesc);

2641

2642 /*

2643 * Create a memory context we'll reset to combine results for a single

2644 * page range (received from the workers). We don't expect huge number of

2645 * overlaps under regular circumstances, because for large tables the

2646 * chunk size is likely larger than the BRIN page range), but it can

2647 * happen, and the union functions may do all kinds of stuff. So we better

2648 * reset the context once in a while.

2649 */

2650 rangeCxt = AllocSetContextCreate(CurrentMemoryContext,

2651 "brin union",

2652 ALLOCSET_DEFAULT_SIZES);

2653 oldCxt = MemoryContextSwitchTo(rangeCxt);

2654

2655 /*

2656 * Read the BRIN tuples from the shared tuplesort, sorted by block number.

2657 * That probably gives us an index that is cheaper to scan, thanks to

2658 * mostly getting data from the same index page as before.

2659 */

2660 while ((btup = tuplesort_getbrintuple(state->bs_sortstate, &tuplen, true)) != NULL)

2661 {

2662 /* Ranges should be multiples of pages_per_range for the index. */

2663 Assert(btup->bt_blkno % state->bs_leader->brinshared->pagesPerRange == 0);

2664

2665 /*

2666 * Do we need to union summaries for the same page range?

2667 *

2668 * If this is the first brin tuple we read, then just deform it into

2669 * the memtuple, and continue with the next one from tuplesort. We

2670 * however may need to insert empty summaries into the index.

2671 *

2672 * If it's the same block as the last we saw, we simply union the brin

2673 * tuple into it, and we're done - we don't even need to insert empty

2674 * ranges, because that was done earlier when we saw the first brin

2675 * tuple (for this range).

2676 *

2677 * Finally, if it's not the first brin tuple, and it's not the same

2678 * page range, we need to do the insert and then deform the tuple into

2679 * the memtuple. Then we'll insert empty ranges before the new brin

2680 * tuple, if needed.

2681 */

2682 if (prevblkno == InvalidBlockNumber)

2683 {

2684 /* First brin tuples, just deform into memtuple. */

2685 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);

2686

2687 /* continue to insert empty pages before thisblock */

2688 }

2689 else if (memtuple->bt_blkno == btup->bt_blkno)

2690 {

2691 /*

2692 * Not the first brin tuple, but same page range as the previous

2693 * one, so we can merge it into the memtuple.

2694 */

2695 union_tuples(state->bs_bdesc, memtuple, btup);

2696 continue;

2697 }

2698 else

2699 {

2700 BrinTuple *tmp;

2701 Size len;

2702

2703 /*

2704 * We got brin tuple for a different page range, so form a brin

2705 * tuple from the memtuple, insert it, and re-init the memtuple

2706 * from the new brin tuple.

2707 */

2708 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,

2709 memtuple, &len);

2710

2711 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,

2712 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);

2713

2714 /*

2715 * Reset the per-output-range context. This frees all the memory

2716 * possibly allocated by the union functions, and also the BRIN

2717 * tuple we just formed and inserted.

2718 */

2719 MemoryContextReset(rangeCxt);

2720

2721 memtuple = brin_deform_tuple(state->bs_bdesc, btup, memtuple);

2722

2723 /* continue to insert empty pages before thisblock */

2724 }

2725

2726 /* Fill empty ranges for all ranges missing in the tuplesort. */

2727 brin_fill_empty_ranges(state, prevblkno, btup->bt_blkno);

2728

2729 prevblkno = btup->bt_blkno;

2730 }

2731

2732 tuplesort_end(state->bs_sortstate);

2733

2734 /* Fill the BRIN tuple for the last page range with data. */

2735 if (prevblkno != InvalidBlockNumber)

2736 {

2737 BrinTuple *tmp;

2738 Size len;

2739

2740 tmp = brin_form_tuple(state->bs_bdesc, memtuple->bt_blkno,

2741 memtuple, &len);

2742

2743 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,

2744 &state->bs_currentInsertBuf, tmp->bt_blkno, tmp, len);

2745

2746 pfree(tmp);

2747 }

2748

2749 /* Fill empty ranges at the end, for all ranges missing in the tuplesort. */

2750 brin_fill_empty_ranges(state, prevblkno, state->bs_maxRangeStart);

2751

2752 /*

2753 * Switch back to the original memory context, and destroy the one we

2754 * created to isolate the union_tuple calls.

2755 */

2756 MemoryContextSwitchTo(oldCxt);

2757 MemoryContextDelete(rangeCxt);

2758

2759 return reltuples;

2760}

2761

2762/*

2763 * Returns size of shared memory required to store state for a parallel

2764 * brin index build based on the snapshot its parallel scan will use.

2765 */

2766static Size

2767 _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)

2768{

2769 /* c.f. shm_toc_allocate as to why BUFFERALIGN is used */

2770 return add_size(BUFFERALIGN(sizeof(BrinShared)),

2771 table_parallelscan_estimate(heap, snapshot));

2772}

2773

2774/*

2775 * Within leader, participate as a parallel worker.

2776 */

2777static void

2778 _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)

2779{

2780 BrinLeader *brinleader = buildstate->bs_leader;

2781 int sortmem;

2782

2783 /*

2784 * Might as well use reliable figure when doling out maintenance_work_mem

2785 * (when requested number of workers were not launched, this will be

2786 * somewhat higher than it is for other workers).

2787 */

2788 sortmem = maintenance_work_mem / brinleader->nparticipanttuplesorts;

2789

2790 /* Perform work common to all participants */

2791 _brin_parallel_scan_and_build(buildstate, brinleader->brinshared,

2792 brinleader->sharedsort, heap, index, sortmem, true);

2793}

2794

2795/*

2796 * Perform a worker's portion of a parallel sort.

2797 *

2798 * This generates a tuplesort for the worker portion of the table.

2799 *

2800 * sortmem is the amount of working memory to use within each worker,

2801 * expressed in KBs.

2802 *

2803 * When this returns, workers are done, and need only release resources.

2804 */

2805static void

2806 _brin_parallel_scan_and_build(BrinBuildState *state,

2807 BrinShared *brinshared, Sharedsort *sharedsort,

2808 Relation heap, Relation index,

2809 int sortmem, bool progress)

2810{

2811 SortCoordinate coordinate;

2812 TableScanDesc scan;

2813 double reltuples;

2814 IndexInfo *indexInfo;

2815

2816 /* Initialize local tuplesort coordination state */

2817 coordinate = palloc0(sizeof(SortCoordinateData));

2818 coordinate->isWorker = true;

2819 coordinate->nParticipants = -1;

2820 coordinate->sharedsort = sharedsort;

2821

2822 /* Begin "partial" tuplesort */

2823 state->bs_sortstate = tuplesort_begin_index_brin(sortmem, coordinate,

2824 TUPLESORT_NONE);

2825

2826 /* Join parallel scan */

2827 indexInfo = BuildIndexInfo(index);

2828 indexInfo->ii_Concurrent = brinshared->isconcurrent;

2829

2830 scan = table_beginscan_parallel(heap,

2831 ParallelTableScanFromBrinShared(brinshared));

2832

2833 reltuples = table_index_build_scan(heap, index, indexInfo, true, true,

2834 brinbuildCallbackParallel, state, scan);

2835

2836 /* insert the last item */

2837 form_and_spill_tuple(state);

2838

2839 /* sort the BRIN ranges built by this worker */

2840 tuplesort_performsort(state->bs_sortstate);

2841

2842 state->bs_reltuples += reltuples;

2843

2844 /*

2845 * Done. Record ambuild statistics.

2846 */

2847 SpinLockAcquire(&brinshared->mutex);

2848 brinshared->nparticipantsdone++;

2849 brinshared->reltuples += state->bs_reltuples;

2850 brinshared->indtuples += state->bs_numtuples;

2851 SpinLockRelease(&brinshared->mutex);

2852

2853 /* Notify leader */

2854 ConditionVariableSignal(&brinshared->workersdonecv);

2855

2856 tuplesort_end(state->bs_sortstate);

2857}

2858

2859/*

2860 * Perform work within a launched parallel process.

2861 */

2862void

2863 _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)

2864{

2865 char *sharedquery;

2866 BrinShared *brinshared;

2867 Sharedsort *sharedsort;

2868 BrinBuildState *buildstate;

2869 Relation heapRel;

2870 Relation indexRel;

2871 LOCKMODE heapLockmode;

2872 LOCKMODE indexLockmode;

2873 WalUsage *walusage;

2874 BufferUsage *bufferusage;

2875 int sortmem;

2876

2877 /*

2878 * The only possible status flag that can be set to the parallel worker is

2879 * PROC_IN_SAFE_IC.

2880 */

2881 Assert((MyProc->statusFlags == 0) ||

2882 (MyProc->statusFlags == PROC_IN_SAFE_IC));

2883

2884 /* Set debug_query_string for individual workers first */

2885 sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);

2886 debug_query_string = sharedquery;

2887

2888 /* Report the query string from leader */

2889 pgstat_report_activity(STATE_RUNNING, debug_query_string);

2890

2891 /* Look up brin shared state */

2892 brinshared = shm_toc_lookup(toc, PARALLEL_KEY_BRIN_SHARED, false);

2893

2894 /* Open relations using lock modes known to be obtained by index.c */

2895 if (!brinshared->isconcurrent)

2896 {

2897 heapLockmode = ShareLock;

2898 indexLockmode = AccessExclusiveLock;

2899 }

2900 else

2901 {

2902 heapLockmode = ShareUpdateExclusiveLock;

2903 indexLockmode = RowExclusiveLock;

2904 }

2905

2906 /* Track query ID */

2907 pgstat_report_query_id(brinshared->queryid, false);

2908

2909 /* Open relations within worker */

2910 heapRel = table_open(brinshared->heaprelid, heapLockmode);

2911 indexRel = index_open(brinshared->indexrelid, indexLockmode);

2912

2913 buildstate = initialize_brin_buildstate(indexRel, NULL,

2914 brinshared->pagesPerRange,

2915 InvalidBlockNumber);

2916

2917 /* Look up shared state private to tuplesort.c */

2918 sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);

2919 tuplesort_attach_shared(sharedsort, seg);

2920

2921 /* Prepare to track buffer usage during parallel execution */

2922 InstrStartParallelQuery();

2923

2924 /*

2925 * Might as well use reliable figure when doling out maintenance_work_mem

2926 * (when requested number of workers were not launched, this will be

2927 * somewhat higher than it is for other workers).

2928 */

2929 sortmem = maintenance_work_mem / brinshared->scantuplesortstates;

2930

2931 _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,

2932 heapRel, indexRel, sortmem, false);

2933

2934 /* Report WAL/buffer usage during parallel execution */

2935 bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);

2936 walusage = shm_toc_lookup(toc, PARALLEL_KEY_WAL_USAGE, false);

2937 InstrEndParallelQuery(&bufferusage[ParallelWorkerNumber],

2938 &walusage[ParallelWorkerNumber]);

2939

2940 index_close(indexRel, indexLockmode);

2941 table_close(heapRel, heapLockmode);

2942}

2943

2944/*

2945 * brin_build_empty_tuple

2946 * Maybe initialize a BRIN tuple representing empty range.

2947 *

2948 * Returns a BRIN tuple representing an empty page range starting at the

2949 * specified block number. The empty tuple is initialized only once, when it's

2950 * needed for the first time, stored in the memory context bs_context to ensure

2951 * proper life span, and reused on following calls. All empty tuples are

2952 * exactly the same except for the bt_blkno field, which is set to the value

2953 * in blkno parameter.

2954 */

2955static void

2956 brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)

2957{

2958 /* First time an empty tuple is requested? If yes, initialize it. */

2959 if (state->bs_emptyTuple == NULL)

2960 {

2961 MemoryContext oldcxt;

2962 BrinMemTuple *dtuple = brin_new_memtuple(state->bs_bdesc);

2963

2964 /* Allocate the tuple in context for the whole index build. */

2965 oldcxt = MemoryContextSwitchTo(state->bs_context);

2966

2967 state->bs_emptyTuple = brin_form_tuple(state->bs_bdesc, blkno, dtuple,

2968 &state->bs_emptyTupleLen);

2969

2970 MemoryContextSwitchTo(oldcxt);

2971 }

2972 else

2973 {

2974 /* If we already have an empty tuple, just update the block. */

2975 state->bs_emptyTuple->bt_blkno = blkno;

2976 }

2977}

2978

2979/*

2980 * brin_fill_empty_ranges

2981 * Add BRIN index tuples representing empty page ranges.

2982 *

2983 * prevRange/nextRange determine for which page ranges to add empty summaries.

2984 * Both boundaries are exclusive, i.e. only ranges starting at blkno for which

2985 * (prevRange < blkno < nextRange) will be added to the index.

2986 *

2987 * If prevRange is InvalidBlockNumber, this means there was no previous page

2988 * range (i.e. the first empty range to add is for blkno=0).

2989 *

2990 * The empty tuple is built only once, and then reused for all future calls.

2991 */

2992static void

2993 brin_fill_empty_ranges(BrinBuildState *state,

2994 BlockNumber prevRange, BlockNumber nextRange)

2995{

2996 BlockNumber blkno;

2997

2998 /*

2999 * If we already summarized some ranges, we need to start with the next

3000 * one. Otherwise start from the first range of the table.

3001 */

3002 blkno = (prevRange == InvalidBlockNumber) ? 0 : (prevRange + state->bs_pagesPerRange);

3003

3004 /* Generate empty ranges until we hit the next non-empty range. */

3005 while (blkno < nextRange)

3006 {

3007 /* Did we already build the empty tuple? If not, do it now. */

3008 brin_build_empty_tuple(state, blkno);

3009

3010 brin_doinsert(state->bs_irel, state->bs_pagesPerRange, state->bs_rmAccess,

3011 &state->bs_currentInsertBuf,

3012 blkno, state->bs_emptyTuple, state->bs_emptyTupleLen);

3013

3014 /* try next page range */

3015 blkno += state->bs_pagesPerRange;

3016 }

3017}

acl.h

ACLCHECK_NOT_OWNER

@ ACLCHECK_NOT_OWNER

Definition: acl.h:185

aclcheck_error

void aclcheck_error(AclResult aclerr, ObjectType objtype, const char *objectname)

Definition: aclchk.c:2652

object_ownercheck

bool object_ownercheck(Oid classid, Oid objectid, Oid roleid)

Definition: aclchk.c:4088

AttrNumber

int16 AttrNumber

Definition: attnum.h:21

validate

static bool validate(Port *port, const char *auth)

Definition: auth-oauth.c:638

AutoVacuumRequestWork

bool AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId, BlockNumber blkno)

Definition: autovacuum.c:3320

autovacuum.h

AVW_BRINSummarizeRange

@ AVW_BRINSummarizeRange

Definition: autovacuum.h:25

ParallelWorkerNumber

int ParallelWorkerNumber

Definition: parallel.c:115

InitializeParallelDSM

void InitializeParallelDSM(ParallelContext *pcxt)

Definition: parallel.c:211

WaitForParallelWorkersToFinish

void WaitForParallelWorkersToFinish(ParallelContext *pcxt)

Definition: parallel.c:796

LaunchParallelWorkers

void LaunchParallelWorkers(ParallelContext *pcxt)

Definition: parallel.c:573

DestroyParallelContext

void DestroyParallelContext(ParallelContext *pcxt)

Definition: parallel.c:950

CreateParallelContext

ParallelContext * CreateParallelContext(const char *library_name, const char *function_name, int nworkers)

Definition: parallel.c:173

WaitForParallelWorkersToAttach

void WaitForParallelWorkersToAttach(ParallelContext *pcxt)

Definition: parallel.c:693

pgstat_report_query_id

void pgstat_report_query_id(int64 query_id, bool force)

Definition: backend_status.c:686

pgstat_get_my_query_id

int64 pgstat_get_my_query_id(void)

Definition: backend_status.c:1138

pgstat_report_activity

void pgstat_report_activity(BackendState state, const char *cmd_str)

Definition: backend_status.c:572

STATE_RUNNING

@ STATE_RUNNING

Definition: backend_status.h:29

BlockNumber

uint32 BlockNumber

Definition: block.h:31

InvalidBlockNumber

#define InvalidBlockNumber

Definition: block.h:33

MaxBlockNumber

#define MaxBlockNumber

Definition: block.h:35

values

static Datum values[MAXATTR]

Definition: bootstrap.c:153

PARALLEL_KEY_BUFFER_USAGE

#define PARALLEL_KEY_BUFFER_USAGE

Definition: brin.c:51

brininsertcleanup

void brininsertcleanup(Relation index, IndexInfo *indexInfo)

Definition: brin.c:512

_brin_parallel_merge

static double _brin_parallel_merge(BrinBuildState *state)

Definition: brin.c:2620

brin_vacuum_scan

static void brin_vacuum_scan(Relation idxrel, BufferAccessStrategy strategy)

Definition: brin.c:2172

brin_desummarize_range

Datum brin_desummarize_range(PG_FUNCTION_ARGS)

Definition: brin.c:1491

brinrescan

void brinrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys)

Definition: brin.c:959

terminate_brin_buildstate

static void terminate_brin_buildstate(BrinBuildState *state)

Definition: brin.c:1716

PARALLEL_KEY_BRIN_SHARED

#define PARALLEL_KEY_BRIN_SHARED

Definition: brin.c:47

brin_summarize_range

Datum brin_summarize_range(PG_FUNCTION_ARGS)

Definition: brin.c:1381

brinbulkdelete

IndexBulkDeleteResult * brinbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)

Definition: brin.c:1303

form_and_spill_tuple

static void form_and_spill_tuple(BrinBuildState *state)

Definition: brin.c:2006

BRIN_ALL_BLOCKRANGES

#define BRIN_ALL_BLOCKRANGES

Definition: brin.c:209

BrinShared

struct BrinShared BrinShared

brin_summarize_new_values

Datum brin_summarize_new_values(PG_FUNCTION_ARGS)

Definition: brin.c:1366

brinbeginscan

IndexScanDesc brinbeginscan(Relation r, int nkeys, int norderbys)

Definition: brin.c:539

brinoptions

bytea * brinoptions(Datum reloptions, bool validate)

Definition: brin.c:1348

bringetbitmap

int64 bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)

Definition: brin.c:567

brinsummarize

static void brinsummarize(Relation index, Relation heapRel, BlockNumber pageRange, bool include_partial, double *numSummarized, double *numExisting)

Definition: brin.c:1887

form_and_insert_tuple

static void form_and_insert_tuple(BrinBuildState *state)

Definition: brin.c:1985

brinbuildempty

void brinbuildempty(Relation index)

Definition: brin.c:1274

brin_free_desc

void brin_free_desc(BrinDesc *bdesc)

Definition: brin.c:1636

BrinInsertState

struct BrinInsertState BrinInsertState

union_tuples

static void union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)

Definition: brin.c:2031

_brin_parallel_scan_and_build

static void _brin_parallel_scan_and_build(BrinBuildState *state, BrinShared *brinshared, Sharedsort *sharedsort, Relation heap, Relation index, int sortmem, bool progress)

Definition: brin.c:2806

initialize_brin_buildstate

static BrinBuildState * initialize_brin_buildstate(Relation idxRel, BrinRevmap *revmap, BlockNumber pagesPerRange, BlockNumber tablePages)

Definition: brin.c:1669

_brin_begin_parallel

static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request)

Definition: brin.c:2363

brinGetStats

void brinGetStats(Relation index, BrinStatsData *stats)

Definition: brin.c:1648

_brin_leader_participate_as_worker

static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Relation index)

Definition: brin.c:2778

add_values_to_range

static bool add_values_to_range(Relation idxRel, BrinDesc *bdesc, BrinMemTuple *dtup, const Datum *values, const bool *nulls)

Definition: brin.c:2205

_brin_end_parallel

static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)

Definition: brin.c:2548

_brin_parallel_estimate_shared

static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)

Definition: brin.c:2767

BrinBuildState

struct BrinBuildState BrinBuildState

brin_fill_empty_ranges

static void brin_fill_empty_ranges(BrinBuildState *state, BlockNumber prevRange, BlockNumber nextRange)

Definition: brin.c:2993

brinbuild

IndexBuildResult * brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)

Definition: brin.c:1105

brinvacuumcleanup

IndexBulkDeleteResult * brinvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)

Definition: brin.c:1318

BrinLeader

struct BrinLeader BrinLeader

BrinOpaque

struct BrinOpaque BrinOpaque

summarize_range

static void summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel, BlockNumber heapBlk, BlockNumber heapNumBlks)

Definition: brin.c:1761

ParallelTableScanFromBrinShared

#define ParallelTableScanFromBrinShared(shared)

Definition: brin.c:116

PARALLEL_KEY_TUPLESORT

#define PARALLEL_KEY_TUPLESORT

Definition: brin.c:48

brinbuildCallbackParallel

static void brinbuildCallbackParallel(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)

Definition: brin.c:1046

brininsert

bool brininsert(Relation idxRel, Datum *values, bool *nulls, ItemPointer heaptid, Relation heapRel, IndexUniqueCheck checkUnique, bool indexUnchanged, IndexInfo *indexInfo)

Definition: brin.c:344

PARALLEL_KEY_QUERY_TEXT

#define PARALLEL_KEY_QUERY_TEXT

Definition: brin.c:49

brinhandler

Datum brinhandler(PG_FUNCTION_ARGS)

Definition: brin.c:250

brin_build_desc

BrinDesc * brin_build_desc(Relation rel)

Definition: brin.c:1581

_brin_parallel_build_main

void _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)

Definition: brin.c:2863

brin_build_empty_tuple

static void brin_build_empty_tuple(BrinBuildState *state, BlockNumber blkno)

Definition: brin.c:2956

PARALLEL_KEY_WAL_USAGE

#define PARALLEL_KEY_WAL_USAGE

Definition: brin.c:50

_brin_parallel_heapscan

static double _brin_parallel_heapscan(BrinBuildState *state)

Definition: brin.c:2579

initialize_brin_insertstate

static BrinInsertState * initialize_brin_insertstate(Relation idxRel, IndexInfo *indexInfo)

Definition: brin.c:315

brinbuildCallback

static void brinbuildCallback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *brstate)

Definition: brin.c:995

brinendscan

void brinendscan(IndexScanDesc scan)

Definition: brin.c:978

check_null_keys

static bool check_null_keys(BrinValues *bval, ScanKey *nullkeys, int nnullkeys)

Definition: brin.c:2299

brin.h

BrinGetPagesPerRange

#define BrinGetPagesPerRange(relation)

Definition: brin.h:41

BrinGetAutoSummarize

#define BrinGetAutoSummarize(relation)

Definition: brin.h:47

BRIN_LAST_OPTIONAL_PROCNUM

#define BRIN_LAST_OPTIONAL_PROCNUM

Definition: brin_internal.h:78

BRIN_PROCNUM_UNION

#define BRIN_PROCNUM_UNION

Definition: brin_internal.h:73

BRIN_PROCNUM_OPTIONS

#define BRIN_PROCNUM_OPTIONS

Definition: brin_internal.h:75

BRIN_PROCNUM_OPCINFO

#define BRIN_PROCNUM_OPCINFO

Definition: brin_internal.h:70

BRIN_PROCNUM_CONSISTENT

#define BRIN_PROCNUM_CONSISTENT

Definition: brin_internal.h:72

BRIN_elog

#define BRIN_elog(args)

Definition: brin_internal.h:85

BRIN_PROCNUM_ADDVALUE

#define BRIN_PROCNUM_ADDVALUE

Definition: brin_internal.h:71

brin_page.h

BRIN_CURRENT_VERSION

#define BRIN_CURRENT_VERSION

Definition: brin_page.h:72

BRIN_METAPAGE_BLKNO

#define BRIN_METAPAGE_BLKNO

Definition: brin_page.h:75

brin_doupdate

bool brin_doupdate(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, BlockNumber heapBlk, Buffer oldbuf, OffsetNumber oldoff, const BrinTuple *origtup, Size origsz, const BrinTuple *newtup, Size newsz, bool samepage)

Definition: brin_pageops.c:53

brin_page_cleanup

void brin_page_cleanup(Relation idxrel, Buffer buf)

Definition: brin_pageops.c:624

brin_doinsert

OffsetNumber brin_doinsert(Relation idxrel, BlockNumber pagesPerRange, BrinRevmap *revmap, Buffer *buffer, BlockNumber heapBlk, BrinTuple *tup, Size itemsz)

Definition: brin_pageops.c:342

brin_metapage_init

void brin_metapage_init(Page page, BlockNumber pagesPerRange, uint16 version)

Definition: brin_pageops.c:486

brin_can_do_samepage_update

bool brin_can_do_samepage_update(Buffer buffer, Size origsz, Size newsz)

Definition: brin_pageops.c:323

brin_pageops.h

brinRevmapDesummarizeRange

bool brinRevmapDesummarizeRange(Relation idxrel, BlockNumber heapBlk)

Definition: brin_revmap.c:323

brinRevmapTerminate

void brinRevmapTerminate(BrinRevmap *revmap)

Definition: brin_revmap.c:100

brinRevmapInitialize

BrinRevmap * brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)

Definition: brin_revmap.c:70

brinGetTupleForHeapBlock

BrinTuple * brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk, Buffer *buf, OffsetNumber *off, Size *size, int mode)

Definition: brin_revmap.c:194

brin_copy_tuple

BrinTuple * brin_copy_tuple(BrinTuple *tuple, Size len, BrinTuple *dest, Size *destsz)

Definition: brin_tuple.c:446

brin_form_tuple

BrinTuple * brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple, Size *size)

Definition: brin_tuple.c:99

brin_new_memtuple

BrinMemTuple * brin_new_memtuple(BrinDesc *brdesc)

Definition: brin_tuple.c:482

brin_free_tuple

void brin_free_tuple(BrinTuple *tuple)

Definition: brin_tuple.c:433

brin_form_placeholder_tuple

BrinTuple * brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)

Definition: brin_tuple.c:388

brin_memtuple_initialize

BrinMemTuple * brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)

Definition: brin_tuple.c:511

brin_deform_tuple

BrinMemTuple * brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)

Definition: brin_tuple.c:553

brinvalidate

bool brinvalidate(Oid opclassoid)

Definition: brin_validate.c:37

brin_xlog.h

SizeOfBrinCreateIdx

#define SizeOfBrinCreateIdx

Definition: brin_xlog.h:55

XLOG_BRIN_CREATE_INDEX

#define XLOG_BRIN_CREATE_INDEX

Definition: brin_xlog.h:31

Buffer

int Buffer

Definition: buf.h:23

BufferIsInvalid

#define BufferIsInvalid(buffer)

Definition: buf.h:31

InvalidBuffer

#define InvalidBuffer

Definition: buf.h:25

BufferGetBlockNumber

BlockNumber BufferGetBlockNumber(Buffer buffer)

Definition: bufmgr.c:4198

ExtendBufferedRel

Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)

Definition: bufmgr.c:858

ReleaseBuffer

void ReleaseBuffer(Buffer buffer)

Definition: bufmgr.c:5338

UnlockReleaseBuffer

void UnlockReleaseBuffer(Buffer buffer)

Definition: bufmgr.c:5355

MarkBufferDirty

void MarkBufferDirty(Buffer buffer)

Definition: bufmgr.c:2921

LockBuffer

void LockBuffer(Buffer buffer, int mode)

Definition: bufmgr.c:5572

ReadBufferExtended

Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)

Definition: bufmgr.c:805

ReadBuffer

Buffer ReadBuffer(Relation reln, BlockNumber blockNum)

Definition: bufmgr.c:758

bufmgr.h

BUFFER_LOCK_UNLOCK

#define BUFFER_LOCK_UNLOCK

Definition: bufmgr.h:196

BUFFER_LOCK_SHARE

#define BUFFER_LOCK_SHARE

Definition: bufmgr.h:197

RelationGetNumberOfBlocks

#define RelationGetNumberOfBlocks(reln)

Definition: bufmgr.h:283

BufferGetPage

static Page BufferGetPage(Buffer buffer)

Definition: bufmgr.h:417

EB_SKIP_EXTENSION_LOCK

@ EB_SKIP_EXTENSION_LOCK

Definition: bufmgr.h:75

EB_LOCK_FIRST

@ EB_LOCK_FIRST

Definition: bufmgr.h:87

RBM_NORMAL

@ RBM_NORMAL

Definition: bufmgr.h:46

BMR_REL

#define BMR_REL(p_rel)

Definition: bufmgr.h:111

BufferIsValid

static bool BufferIsValid(Buffer bufnum)

Definition: bufmgr.h:368

PageGetFreeSpace

Size PageGetFreeSpace(const PageData *page)

Definition: bufpage.c:906

PageGetContents

static char * PageGetContents(Page page)

Definition: bufpage.h:258

PageGetItemId

static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)

Definition: bufpage.h:244

PageSetLSN

static void PageSetLSN(Page page, XLogRecPtr lsn)

Definition: bufpage.h:391

Page

PageData * Page

Definition: bufpage.h:82

Min

#define Min(x, y)

Definition: c.h:1003

MAXALIGN

#define MAXALIGN(LEN)

Definition: c.h:810

PG_USED_FOR_ASSERTS_ONLY

#define PG_USED_FOR_ASSERTS_ONLY

Definition: c.h:223

BUFFERALIGN

#define BUFFERALIGN(LEN)

Definition: c.h:812

int64

int64_t int64

Definition: c.h:535

int32

int32_t int32

Definition: c.h:534

lengthof

#define lengthof(array)

Definition: c.h:787

OidIsValid

#define OidIsValid(objectId)

Definition: c.h:774

Size

size_t Size

Definition: c.h:610

ConditionVariableCancelSleep

bool ConditionVariableCancelSleep(void)

Definition: condition_variable.c:230

ConditionVariableInit

void ConditionVariableInit(ConditionVariable *cv)

Definition: condition_variable.c:35

ConditionVariableSleep

void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

Definition: condition_variable.c:96

ConditionVariableSignal

void ConditionVariableSignal(ConditionVariable *cv)

Definition: condition_variable.c:259

datumCopy

Datum datumCopy(Datum value, bool typByVal, int typLen)

Definition: datum.c:132

datum.h

errhint

int errhint(const char *fmt,...)

Definition: elog.c:1321

errcode

int errcode(int sqlerrcode)

Definition: elog.c:854

errmsg

int errmsg(const char *fmt,...)

Definition: elog.c:1071

LOG

#define LOG

Definition: elog.h:31

DEBUG2

#define DEBUG2

Definition: elog.h:29

DEBUG1

#define DEBUG1

Definition: elog.h:30

ERROR

#define ERROR

Definition: elog.h:39

elog

#define elog(elevel,...)

Definition: elog.h:226

ereport

#define ereport(elevel,...)

Definition: elog.h:150

palloc_object

#define palloc_object(type)

Definition: fe_memutils.h:74

palloc_array

#define palloc_array(type, count)

Definition: fe_memutils.h:76

palloc0_array

#define palloc0_array(type, count)

Definition: fe_memutils.h:77

palloc0_object

#define palloc0_object(type)

Definition: fe_memutils.h:75

FunctionCall4Coll

Datum FunctionCall4Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3, Datum arg4)

Definition: fmgr.c:1197

FunctionCall3Coll

Datum FunctionCall3Coll(FmgrInfo *flinfo, Oid collation, Datum arg1, Datum arg2, Datum arg3)

Definition: fmgr.c:1172

fmgr_info_copy

void fmgr_info_copy(FmgrInfo *dstinfo, FmgrInfo *srcinfo, MemoryContext destcxt)

Definition: fmgr.c:581

PG_RETURN_VOID

#define PG_RETURN_VOID()

Definition: fmgr.h:349

PG_GETARG_OID

#define PG_GETARG_OID(n)

Definition: fmgr.h:275

DirectFunctionCall2

#define DirectFunctionCall2(func, arg1, arg2)

Definition: fmgr.h:684

PG_GETARG_DATUM

#define PG_GETARG_DATUM(n)

Definition: fmgr.h:268

PG_GETARG_INT64

#define PG_GETARG_INT64(n)

Definition: fmgr.h:283

FunctionCall1

#define FunctionCall1(flinfo, arg1)

Definition: fmgr.h:700

PG_RETURN_INT32

#define PG_RETURN_INT32(x)

Definition: fmgr.h:354

PG_RETURN_POINTER

#define PG_RETURN_POINTER(x)

Definition: fmgr.h:361

PG_FUNCTION_ARGS

#define PG_FUNCTION_ARGS

Definition: fmgr.h:193

FreeSpaceMapVacuumRange

void FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)

Definition: freespace.c:377

FreeSpaceMapVacuum

void FreeSpaceMapVacuum(Relation rel)

Definition: freespace.c:358

RecordPageWithFreeSpace

void RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)

Definition: freespace.c:194

freespace.h

RelationGetIndexScan

IndexScanDesc RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)

Definition: genam.c:80

IndexBulkDeleteCallback

bool(* IndexBulkDeleteCallback)(ItemPointer itemptr, void *state)

Definition: genam.h:114

IndexUniqueCheck

Definition: genam.h:143

maintenance_work_mem

int maintenance_work_mem

Definition: globals.c:133

NewGUCNestLevel

int NewGUCNestLevel(void)

Definition: guc.c:2241

RestrictSearchPath

void RestrictSearchPath(void)

Definition: guc.c:2252

AtEOXact_GUC

void AtEOXact_GUC(bool isCommit, int nestLevel)

Definition: guc.c:2268

guc.h

Assert

Assert(PointerIsAligned(start, uint64))

IndexGetRelation

Oid IndexGetRelation(Oid indexId, bool missing_ok)

Definition: index.c:3583

BuildIndexInfo

IndexInfo * BuildIndexInfo(Relation index)

Definition: index.c:2428

index.h

index_selfuncs.h

index_getprocinfo

FmgrInfo * index_getprocinfo(Relation irel, AttrNumber attnum, uint16 procnum)

Definition: indexam.c:917

index_close

void index_close(Relation relation, LOCKMODE lockmode)

Definition: indexam.c:177

index_open

Relation index_open(Oid relationId, LOCKMODE lockmode)

Definition: indexam.c:133

InstrAccumParallelQuery

void InstrAccumParallelQuery(BufferUsage *bufusage, WalUsage *walusage)

Definition: instrument.c:218

InstrEndParallelQuery

void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage)

Definition: instrument.c:208

InstrStartParallelQuery

void InstrStartParallelQuery(void)

Definition: instrument.c:200

b

int b

Definition: isn.c:74

a

int a

Definition: isn.c:73

i

int i

Definition: isn.c:77

if

if(TABLE==NULL||TABLE_index==NULL)

Definition: isn.c:81

ItemIdGetLength

#define ItemIdGetLength(itemId)

Definition: itemid.h:59

ItemPointerGetOffsetNumber

static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)

Definition: itemptr.h:124

ItemPointerGetBlockNumber

static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)

Definition: itemptr.h:103

LOCKMODE

int LOCKMODE

Definition: lockdefs.h:26

AccessExclusiveLock

#define AccessExclusiveLock

Definition: lockdefs.h:43

AccessShareLock

#define AccessShareLock

Definition: lockdefs.h:36

ShareUpdateExclusiveLock

#define ShareUpdateExclusiveLock

Definition: lockdefs.h:39

ShareLock

#define ShareLock

Definition: lockdefs.h:40

RowExclusiveLock

#define RowExclusiveLock

Definition: lockdefs.h:38

MemoryContextReset

void MemoryContextReset(MemoryContext context)

Definition: mcxt.c:400

pfree

void pfree(void *pointer)

Definition: mcxt.c:1594

palloc0

void * palloc0(Size size)

Definition: mcxt.c:1395

palloc

void * palloc(Size size)

Definition: mcxt.c:1365

CurrentMemoryContext

MemoryContext CurrentMemoryContext

Definition: mcxt.c:160

MemoryContextDelete

void MemoryContextDelete(MemoryContext context)

Definition: mcxt.c:469

memutils.h

AllocSetContextCreate

#define AllocSetContextCreate

Definition: memutils.h:129

ALLOCSET_DEFAULT_SIZES

#define ALLOCSET_DEFAULT_SIZES

Definition: memutils.h:160

ALLOCSET_SMALL_SIZES

#define ALLOCSET_SMALL_SIZES

Definition: memutils.h:170

miscadmin.h

SECURITY_RESTRICTED_OPERATION

#define SECURITY_RESTRICTED_OPERATION

Definition: miscadmin.h:318

START_CRIT_SECTION

#define START_CRIT_SECTION()

Definition: miscadmin.h:149

CHECK_FOR_INTERRUPTS

#define CHECK_FOR_INTERRUPTS()

Definition: miscadmin.h:122

END_CRIT_SECTION

#define END_CRIT_SECTION()

Definition: miscadmin.h:151

GetUserIdAndSecContext

void GetUserIdAndSecContext(Oid *userid, int *sec_context)

Definition: miscinit.c:612

GetUserId

Oid GetUserId(void)

Definition: miscinit.c:469

SetUserIdAndSecContext

void SetUserIdAndSecContext(Oid userid, int sec_context)

Definition: miscinit.c:619

sort-test.key

key

Definition: sort-test.py:19

makeNode

#define makeNode(_type_)

Definition: nodes.h:161

OffsetNumber

uint16 OffsetNumber

Definition: off.h:24

FirstOffsetNumber

#define FirstOffsetNumber

Definition: off.h:27

MemoryContextSwitchTo

static MemoryContext MemoryContextSwitchTo(MemoryContext context)

Definition: palloc.h:124

OBJECT_INDEX

@ OBJECT_INDEX

Definition: parsenodes.h:2345

pg_am.h

Form_pg_attribute

FormData_pg_attribute * Form_pg_attribute

Definition: pg_attribute.h:202

len

const void size_t len

Definition: pg_crc32c_sse42.c:28

buf

static char * buf

Definition: pg_test_fsync.c:72

progress

static int progress

Definition: pgbench.c:262

ERRCODE_UNDEFINED_TABLE

#define ERRCODE_UNDEFINED_TABLE

Definition: pgbench.c:79

pgstat.h

pgstat_count_index_scan

#define pgstat_count_index_scan(rel)

Definition: pgstat.h:695

debug_query_string

const char * debug_query_string

Definition: postgres.c:88

postgres.h

Int64GetDatum

static Datum Int64GetDatum(int64 X)

Definition: postgres.h:403

DatumGetBool

static bool DatumGetBool(Datum X)

Definition: postgres.h:100

PointerGetDatum

static Datum PointerGetDatum(const void *X)

Definition: postgres.h:332

BoolGetDatum

static Datum BoolGetDatum(bool X)

Definition: postgres.h:112

ObjectIdGetDatum

static Datum ObjectIdGetDatum(Oid X)

Definition: postgres.h:262

Datum

uint64_t Datum

Definition: postgres.h:70

DatumGetPointer

static Pointer DatumGetPointer(Datum X)

Definition: postgres.h:322

Int32GetDatum

static Datum Int32GetDatum(int32 X)

Definition: postgres.h:222

InvalidOid

#define InvalidOid

Definition: postgres_ext.h:37

Oid

unsigned int Oid

Definition: postgres_ext.h:32

PROC_IN_SAFE_IC

#define PROC_IN_SAFE_IC

Definition: proc.h:59

addrange

static void addrange(struct cvec *cv, chr from, chr to)

Definition: regc_cvec.c:90

rel.h

RelationGetRelid

#define RelationGetRelid(relation)

Definition: rel.h:514

RelationGetDescr

#define RelationGetDescr(relation)

Definition: rel.h:540

RelationGetRelationName

#define RelationGetRelationName(relation)

Definition: rel.h:548

RelationNeedsWAL

#define RelationNeedsWAL(relation)

Definition: rel.h:637

relation.h

build_reloptions

void * build_reloptions(Datum reloptions, bool validate, relopt_kind kind, Size relopt_struct_size, const relopt_parse_elt *relopt_elems, int num_relopt_elems)

Definition: reloptions.c:1943

reloptions.h

RELOPT_KIND_BRIN

@ RELOPT_KIND_BRIN

Definition: reloptions.h:52

RELOPT_TYPE_INT

@ RELOPT_TYPE_INT

Definition: reloptions.h:32

RELOPT_TYPE_BOOL

@ RELOPT_TYPE_BOOL

Definition: reloptions.h:31

MAIN_FORKNUM

@ MAIN_FORKNUM

Definition: relpath.h:58

INIT_FORKNUM

@ INIT_FORKNUM

Definition: relpath.h:61

relscan.h

brincostestimate

void brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages)

Definition: selfuncs.c:8591

shm_toc_allocate

void * shm_toc_allocate(shm_toc *toc, Size nbytes)

Definition: shm_toc.c:88

shm_toc_insert

void shm_toc_insert(shm_toc *toc, uint64 key, void *address)

Definition: shm_toc.c:171

shm_toc_lookup

void * shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)

Definition: shm_toc.c:232

shm_toc_estimate_chunk

#define shm_toc_estimate_chunk(e, sz)

Definition: shm_toc.h:51

shm_toc_estimate_keys

#define shm_toc_estimate_keys(e, cnt)

Definition: shm_toc.h:53

add_size

Size add_size(Size s1, Size s2)

Definition: shmem.c:493

mul_size

Size mul_size(Size s1, Size s2)

Definition: shmem.c:510

SK_SEARCHNOTNULL

#define SK_SEARCHNOTNULL

Definition: skey.h:122

SK_SEARCHNULL

#define SK_SEARCHNULL

Definition: skey.h:121

SK_ISNULL

#define SK_ISNULL

Definition: skey.h:115

GetTransactionSnapshot

Snapshot GetTransactionSnapshot(void)

Definition: snapmgr.c:271

UnregisterSnapshot

void UnregisterSnapshot(Snapshot snapshot)

Definition: snapmgr.c:864

RegisterSnapshot

Snapshot RegisterSnapshot(Snapshot snapshot)

Definition: snapmgr.c:822

SnapshotAny

#define SnapshotAny

Definition: snapmgr.h:33

IsMVCCSnapshot

#define IsMVCCSnapshot(snapshot)

Definition: snapmgr.h:55

SpinLockInit

#define SpinLockInit(lock)

Definition: spin.h:57

SpinLockRelease

#define SpinLockRelease(lock)

Definition: spin.h:61

SpinLockAcquire

#define SpinLockAcquire(lock)

Definition: spin.h:59

relation_close

void relation_close(Relation relation, LOCKMODE lockmode)

Definition: relation.c:205

MyProc

PGPROC * MyProc

Definition: proc.c:66

BrinBuildState

Definition: brin.c:156

BrinBuildState::bs_maxRangeStart

BlockNumber bs_maxRangeStart

Definition: brin.c:163

BrinBuildState::bs_emptyTupleLen

Size bs_emptyTupleLen

Definition: brin.c:169

BrinBuildState::bs_context

MemoryContext bs_context

Definition: brin.c:170

BrinBuildState::bs_dtuple

BrinMemTuple * bs_dtuple

Definition: brin.c:166

BrinBuildState::bs_irel

Relation bs_irel

Definition: brin.c:157

BrinBuildState::bs_pagesPerRange

BlockNumber bs_pagesPerRange

Definition: brin.c:161

BrinBuildState::bs_numtuples

double bs_numtuples

Definition: brin.c:158

BrinBuildState::bs_currentInsertBuf

Buffer bs_currentInsertBuf

Definition: brin.c:160

BrinBuildState::bs_rmAccess

BrinRevmap * bs_rmAccess

Definition: brin.c:164

BrinBuildState::bs_sortstate

Tuplesortstate * bs_sortstate

Definition: brin.c:185

BrinBuildState::bs_leader

BrinLeader * bs_leader

Definition: brin.c:177

BrinBuildState::bs_worker_id

int bs_worker_id

Definition: brin.c:178

BrinBuildState::bs_currRangeStart

BlockNumber bs_currRangeStart

Definition: brin.c:162

BrinBuildState::bs_reltuples

double bs_reltuples

Definition: brin.c:159

BrinBuildState::bs_bdesc

BrinDesc * bs_bdesc

Definition: brin.c:165

BrinBuildState::bs_emptyTuple

BrinTuple * bs_emptyTuple

Definition: brin.c:168

BrinDesc

Definition: brin_internal.h:45

BrinDesc::bd_totalstored

int bd_totalstored

Definition: brin_internal.h:59

BrinDesc::bd_tupdesc

TupleDesc bd_tupdesc

Definition: brin_internal.h:53

BrinDesc::bd_info

BrinOpcInfo * bd_info[FLEXIBLE_ARRAY_MEMBER]

Definition: brin_internal.h:62

BrinDesc::bd_index

Relation bd_index

Definition: brin_internal.h:50

BrinDesc::bd_context

MemoryContext bd_context

Definition: brin_internal.h:47

BrinDesc::bd_disktdesc

TupleDesc bd_disktdesc

Definition: brin_internal.h:56

BrinInsertState

Definition: brin.c:193

BrinInsertState::bis_desc

BrinDesc * bis_desc

Definition: brin.c:195

BrinInsertState::bis_rmAccess

BrinRevmap * bis_rmAccess

Definition: brin.c:194

BrinInsertState::bis_pages_per_range

BlockNumber bis_pages_per_range

Definition: brin.c:196

BrinLeader

Definition: brin.c:123

BrinLeader::nparticipanttuplesorts

int nparticipanttuplesorts

Definition: brin.c:133

BrinLeader::walusage

WalUsage * walusage

Definition: brin.c:147

BrinLeader::brinshared

BrinShared * brinshared

Definition: brin.c:144

BrinLeader::bufferusage

BufferUsage * bufferusage

Definition: brin.c:148

BrinLeader::snapshot

Snapshot snapshot

Definition: brin.c:146

BrinLeader::sharedsort

Sharedsort * sharedsort

Definition: brin.c:145

BrinLeader::pcxt

ParallelContext * pcxt

Definition: brin.c:125

BrinMemTuple

Definition: brin_tuple.h:45

BrinMemTuple::bt_columns

BrinValues bt_columns[FLEXIBLE_ARRAY_MEMBER]

Definition: brin_tuple.h:55

BrinMemTuple::bt_blkno

BlockNumber bt_blkno

Definition: brin_tuple.h:48

BrinMemTuple::bt_placeholder

bool bt_placeholder

Definition: brin_tuple.h:46

BrinMemTuple::bt_empty_range

bool bt_empty_range

Definition: brin_tuple.h:47

BrinMetaPageData

Definition: brin_page.h:65

BrinMetaPageData::lastRevmapPage

BlockNumber lastRevmapPage

Definition: brin_page.h:69

BrinMetaPageData::pagesPerRange

BlockNumber pagesPerRange

Definition: brin_page.h:68

BrinOpaque

Definition: brin.c:203

BrinOpaque::bo_pagesPerRange

BlockNumber bo_pagesPerRange

Definition: brin.c:204

BrinOpaque::bo_bdesc

BrinDesc * bo_bdesc

Definition: brin.c:206

BrinOpaque::bo_rmAccess

BrinRevmap * bo_rmAccess

Definition: brin.c:205

BrinOpcInfo

Definition: brin_internal.h:26

BrinOpcInfo::oi_typcache

TypeCacheEntry * oi_typcache[FLEXIBLE_ARRAY_MEMBER]

Definition: brin_internal.h:37

BrinOpcInfo::oi_nstored

uint16 oi_nstored

Definition: brin_internal.h:28

BrinOpcInfo::oi_regular_nulls

bool oi_regular_nulls

Definition: brin_internal.h:31

BrinOptions

Definition: brin.h:23

BrinRevmap

Definition: brin_revmap.c:47

BrinShared

Definition: brin.c:58

BrinShared::mutex

slock_t mutex

Definition: brin.c:87

BrinShared::scantuplesortstates

int scantuplesortstates

Definition: brin.c:68

BrinShared::nparticipantsdone

int nparticipantsdone

Definition: brin.c:99

BrinShared::heaprelid

Oid heaprelid

Definition: brin.c:64

BrinShared::pagesPerRange

BlockNumber pagesPerRange

Definition: brin.c:67

BrinShared::workersdonecv

ConditionVariable workersdonecv

Definition: brin.c:79

BrinShared::indexrelid

Oid indexrelid

Definition: brin.c:65

BrinShared::isconcurrent

bool isconcurrent

Definition: brin.c:66

BrinShared::indtuples

double indtuples

Definition: brin.c:101

BrinShared::queryid

int64 queryid

Definition: brin.c:71

BrinShared::reltuples

double reltuples

Definition: brin.c:100

BrinStatsData

Definition: brin.h:34

BrinStatsData::revmapNumPages

BlockNumber revmapNumPages

Definition: brin.h:36

BrinStatsData::pagesPerRange

BlockNumber pagesPerRange

Definition: brin.h:35

BrinTuple

Definition: brin_tuple.h:64

BrinTuple::bt_blkno

BlockNumber bt_blkno

Definition: brin_tuple.h:66

BrinValues

Definition: brin_tuple.h:30

BrinValues::bv_hasnulls

bool bv_hasnulls

Definition: brin_tuple.h:32

BrinValues::bv_values

Datum * bv_values

Definition: brin_tuple.h:34

BrinValues::bv_attno

AttrNumber bv_attno

Definition: brin_tuple.h:31

BrinValues::bv_allnulls

bool bv_allnulls

Definition: brin_tuple.h:33

BufferAccessStrategyData

Definition: freelist.c:65

BufferUsage

Definition: instrument.h:25

ConditionVariable

Definition: condition_variable.h:29

FmgrInfo

Definition: fmgr.h:57

IndexAmRoutine

Definition: amapi.h:233

IndexAmRoutine::ambuildphasename

ambuildphasename_function ambuildphasename

Definition: amapi.h:306

IndexAmRoutine::ambuildempty

ambuildempty_function ambuildempty

Definition: amapi.h:296

IndexAmRoutine::amvacuumcleanup

amvacuumcleanup_function amvacuumcleanup

Definition: amapi.h:300

IndexAmRoutine::amclusterable

bool amclusterable

Definition: amapi.h:270

IndexAmRoutine::amoptions

amoptions_function amoptions

Definition: amapi.h:304

IndexAmRoutine::amestimateparallelscan

amestimateparallelscan_function amestimateparallelscan

Definition: amapi.h:318

IndexAmRoutine::amrestrpos

amrestrpos_function amrestrpos

Definition: amapi.h:315

IndexAmRoutine::aminsert

aminsert_function aminsert

Definition: amapi.h:297

IndexAmRoutine::amendscan

amendscan_function amendscan

Definition: amapi.h:313

IndexAmRoutine::amtranslatestrategy

amtranslate_strategy_function amtranslatestrategy

Definition: amapi.h:323

IndexAmRoutine::amoptsprocnum

uint16 amoptsprocnum

Definition: amapi.h:244

IndexAmRoutine::amparallelrescan

amparallelrescan_function amparallelrescan

Definition: amapi.h:320

IndexAmRoutine::amkeytype

Oid amkeytype

Definition: amapi.h:286

IndexAmRoutine::amconsistentordering

bool amconsistentordering

Definition: amapi.h:254

IndexAmRoutine::ampredlocks

bool ampredlocks

Definition: amapi.h:272

IndexAmRoutine::amsupport

uint16 amsupport

Definition: amapi.h:242

IndexAmRoutine::amtranslatecmptype

amtranslate_cmptype_function amtranslatecmptype

Definition: amapi.h:324

IndexAmRoutine::amcostestimate

amcostestimate_function amcostestimate

Definition: amapi.h:302

IndexAmRoutine::amcanorderbyop

bool amcanorderbyop

Definition: amapi.h:248

IndexAmRoutine::amadjustmembers

amadjustmembers_function amadjustmembers

Definition: amapi.h:308

IndexAmRoutine::ambuild

ambuild_function ambuild

Definition: amapi.h:295

IndexAmRoutine::amstorage

bool amstorage

Definition: amapi.h:268

IndexAmRoutine::amstrategies

uint16 amstrategies

Definition: amapi.h:240

IndexAmRoutine::amoptionalkey

bool amoptionalkey

Definition: amapi.h:262

IndexAmRoutine::amgettuple

amgettuple_function amgettuple

Definition: amapi.h:311

IndexAmRoutine::amcanreturn

amcanreturn_function amcanreturn

Definition: amapi.h:301

IndexAmRoutine::amcanunique

bool amcanunique

Definition: amapi.h:258

IndexAmRoutine::amgetbitmap

amgetbitmap_function amgetbitmap

Definition: amapi.h:312

IndexAmRoutine::amproperty

amproperty_function amproperty

Definition: amapi.h:305

IndexAmRoutine::ambulkdelete

ambulkdelete_function ambulkdelete

Definition: amapi.h:299

IndexAmRoutine::amsearcharray

bool amsearcharray

Definition: amapi.h:264

IndexAmRoutine::amsummarizing

bool amsummarizing

Definition: amapi.h:282

IndexAmRoutine::amvalidate

amvalidate_function amvalidate

Definition: amapi.h:307

IndexAmRoutine::ammarkpos

ammarkpos_function ammarkpos

Definition: amapi.h:314

IndexAmRoutine::amcanmulticol

bool amcanmulticol

Definition: amapi.h:260

IndexAmRoutine::amusemaintenanceworkmem

bool amusemaintenanceworkmem

Definition: amapi.h:280

IndexAmRoutine::ambeginscan

ambeginscan_function ambeginscan

Definition: amapi.h:309

IndexAmRoutine::amcanparallel

bool amcanparallel

Definition: amapi.h:274

IndexAmRoutine::amrescan

amrescan_function amrescan

Definition: amapi.h:310

IndexAmRoutine::amcanorder

bool amcanorder

Definition: amapi.h:246

IndexAmRoutine::amcanbuildparallel

bool amcanbuildparallel

Definition: amapi.h:276

IndexAmRoutine::aminitparallelscan

aminitparallelscan_function aminitparallelscan

Definition: amapi.h:319

IndexAmRoutine::amparallelvacuumoptions

uint8 amparallelvacuumoptions

Definition: amapi.h:284

IndexAmRoutine::aminsertcleanup

aminsertcleanup_function aminsertcleanup

Definition: amapi.h:298

IndexAmRoutine::amcanbackward

bool amcanbackward

Definition: amapi.h:256

IndexAmRoutine::amgettreeheight

amgettreeheight_function amgettreeheight

Definition: amapi.h:303

IndexAmRoutine::amcaninclude

bool amcaninclude

Definition: amapi.h:278

IndexAmRoutine::amsearchnulls

bool amsearchnulls

Definition: amapi.h:266

IndexAmRoutine::amconsistentequality

bool amconsistentequality

Definition: amapi.h:252

IndexAmRoutine::amcanhash

bool amcanhash

Definition: amapi.h:250

IndexBuildResult

Definition: genam.h:58

IndexBuildResult::heap_tuples

double heap_tuples

Definition: genam.h:59

IndexBuildResult::index_tuples

double index_tuples

Definition: genam.h:60

IndexBulkDeleteResult

Definition: genam.h:103

IndexBulkDeleteResult::num_pages

BlockNumber num_pages

Definition: genam.h:104

IndexBulkDeleteResult::num_index_tuples

double num_index_tuples

Definition: genam.h:106

IndexInfo

Definition: execnodes.h:163

IndexInfo::ii_AmCache

void * ii_AmCache

Definition: execnodes.h:223

IndexInfo::ii_ParallelWorkers

int ii_ParallelWorkers

Definition: execnodes.h:218

IndexInfo::ii_Concurrent

bool ii_Concurrent

Definition: execnodes.h:210

IndexInfo::ii_Context

MemoryContext ii_Context

Definition: execnodes.h:226

IndexScanDescData

Definition: relscan.h:134

IndexScanDescData::keyData

struct ScanKeyData * keyData

Definition: relscan.h:141

IndexScanDescData::numberOfKeys

int numberOfKeys

Definition: relscan.h:139

IndexScanDescData::instrument

struct IndexScanInstrumentation * instrument

Definition: relscan.h:159

IndexScanDescData::indexRelation

Relation indexRelation

Definition: relscan.h:137

IndexScanDescData::opaque

void * opaque

Definition: relscan.h:153

IndexScanInstrumentation::nsearches

uint64 nsearches

Definition: genam.h:42

IndexVacuumInfo

Definition: genam.h:72

IndexVacuumInfo::index

Relation index

Definition: genam.h:73

IndexVacuumInfo::analyze_only

bool analyze_only

Definition: genam.h:75

IndexVacuumInfo::strategy

BufferAccessStrategy strategy

Definition: genam.h:80

ItemIdData

Definition: itemid.h:26

ItemPointerData

Definition: itemptr.h:37

MemoryContextData

Definition: memnodes.h:118

PGPROC::statusFlags

uint8 statusFlags

Definition: proc.h:259

ParallelContext

Definition: parallel.h:32

ParallelContext::seg

dsm_segment * seg

Definition: parallel.h:42

ParallelContext::estimator

shm_toc_estimator estimator

Definition: parallel.h:41

ParallelContext::toc

shm_toc * toc

Definition: parallel.h:44

ParallelContext::nworkers_launched

int nworkers_launched

Definition: parallel.h:37

ParallelContext::nworkers

int nworkers

Definition: parallel.h:35

RelationData

Definition: rel.h:56

RelationData::rd_index

Form_pg_index rd_index

Definition: rel.h:192

RelationData::rd_indcollation

Oid * rd_indcollation

Definition: rel.h:217

RelationData::rd_rel

Form_pg_class rd_rel

Definition: rel.h:111

ScanKeyData

Definition: skey.h:65

ScanKeyData::sk_collation

Oid sk_collation

Definition: skey.h:70

Sharedsort

Definition: tuplesort.c:344

SnapshotData

Definition: snapshot.h:139

SortCoordinateData

Definition: tuplesort.h:47

SortCoordinateData::sharedsort

Sharedsort * sharedsort

Definition: tuplesort.h:59

SortCoordinateData::isWorker

bool isWorker

Definition: tuplesort.h:49

SortCoordinateData::nParticipants

int nParticipants

Definition: tuplesort.h:56

TIDBitmap

Definition: tidbitmap.c:142

TableScanDescData

Definition: relscan.h:34

TupleDescData

Definition: tupdesc.h:136

TupleDescData::natts

int natts

Definition: tupdesc.h:137

TupleDescData::tdrefcount

int tdrefcount

Definition: tupdesc.h:140

Tuplesortstate

Definition: tuplesort.c:186

TypeCacheEntry::typbyval

bool typbyval

Definition: typcache.h:40

TypeCacheEntry::typlen

int16 typlen

Definition: typcache.h:39

WalUsage

Definition: instrument.h:52

dsm_segment

Definition: dsm.c:67

index

Definition: type.h:96

relopt_parse_elt

Definition: reloptions.h:151

shm_toc

Definition: shm_toc.c:27

state

Definition: regguts.h:323

varlena

Definition: c.h:692

xl_brin_createidx

Definition: brin_xlog.h:51

xl_brin_createidx::version

uint16 version

Definition: brin_xlog.h:53

xl_brin_createidx::pagesPerRange

BlockNumber pagesPerRange

Definition: brin_xlog.h:52

table_close

void table_close(Relation relation, LOCKMODE lockmode)

Definition: table.c:126

table_open

Relation table_open(Oid relationId, LOCKMODE lockmode)

Definition: table.c:40

table.h

table_beginscan_parallel

TableScanDesc table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan)

Definition: tableam.c:166

table_parallelscan_estimate

Size table_parallelscan_estimate(Relation rel, Snapshot snapshot)

Definition: tableam.c:131

table_parallelscan_initialize

void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, Snapshot snapshot)

Definition: tableam.c:146

tableam.h

table_index_build_range_scan

static double table_index_build_range_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool anyvisible, bool progress, BlockNumber start_blockno, BlockNumber numblocks, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)

Definition: tableam.h:1777

table_index_build_scan

static double table_index_build_scan(Relation table_rel, Relation index_rel, IndexInfo *index_info, bool allow_sync, bool progress, IndexBuildCallback callback, void *callback_state, TableScanDesc scan)

Definition: tableam.h:1744

tcopprot.h

callback

static void callback(struct sockaddr *addr, struct sockaddr *mask, void *unused)

Definition: test_ifaddrs.c:46

tbm_add_page

void tbm_add_page(TIDBitmap *tbm, BlockNumber pageno)

Definition: tidbitmap.c:433

TupleDescAttr

static FormData_pg_attribute * TupleDescAttr(TupleDesc tupdesc, int i)

Definition: tupdesc.h:160

tuplesort_performsort