/*-------------------------------------------------------------------------
 * Build a btree from sorted input by loading leaf pages sequentially.
 *
 * We use tuplesort.c to sort the given index tuples into order.
 * Then we scan the index tuples in order and build the btree pages
 * for each level.  We load source tuples into leaf-level pages.
 * Whenever we fill a page at one level, we add a link to it to its
 * parent level (starting a new parent level if necessary).  When
 * done, we write out each final page on each level, adding it to
 * its parent level.  When we have only one page on a level, it must be
 * the root -- it can be attached to the btree metapage and we are done.
 *
 * It is not wise to pack the pages entirely full, since then *any*
 * insertion would cause a split (and not only of the leaf page; the need
 * for a split would cascade right up the tree).  The steady-state load
 * factor for btrees is usually estimated at 70%.  We choose to pack leaf
 * pages to the user-controllable fill factor (default 90%) while upper pages
 * are always packed to 70%.  This gives us reasonable density (there aren't
 * many upper pages if the keys are reasonable-size) without risking a lot of
 * cascading splits during early insertions.
 *
 * We use the bulk smgr loading facility to bypass the buffer cache and
 * WAL-log the pages efficiently.
 *
 * This code isn't concerned about the FSM at all.  The caller is responsible
 * for initializing that.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/backend/access/nbtree/nbtsort.c
 *
 *-------------------------------------------------------------------------
 */
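/*
 * A rough sketch of the free-space targets those fill factors imply,
 * assuming the default 8kB block size.  The real thresholds come from
 * BTGetTargetPageFreeSpace() and BTREE_NONLEAF_FILLFACTOR; this only shows
 * the arithmetic, with hypothetical variable names.
 */
#ifdef NBTSORT_FILLFACTOR_EXAMPLE		/* illustration only, not compiled */
	Size		leaf_free = BLCKSZ * (100 - 90) / 100;	/* 8192 -> 819 bytes left free */
	Size		upper_free = BLCKSZ * (100 - BTREE_NONLEAF_FILLFACTOR) / 100;	/* 70 -> 2457 bytes */
#endif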
/* Magic numbers for parallel state sharing */
#define PARALLEL_KEY_BTREE_SHARED		UINT64CONST(0xA000000000000001)
#define PARALLEL_KEY_TUPLESORT			UINT64CONST(0xA000000000000002)
#define PARALLEL_KEY_TUPLESORT_SPOOL2	UINT64CONST(0xA000000000000003)
#define PARALLEL_KEY_QUERY_TEXT			UINT64CONST(0xA000000000000004)
#define PARALLEL_KEY_WAL_USAGE			UINT64CONST(0xA000000000000005)
#define PARALLEL_KEY_BUFFER_USAGE		UINT64CONST(0xA000000000000006)
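/*
 * Rough sketch of how these keys are used (variable names are assumptions;
 * the surrounding setup is elided here): the leader publishes each piece of
 * shared state under its key in the parallel context's table of contents,
 * and workers later retrieve it with the same key.
 */
#ifdef NBTSORT_TOC_KEY_EXAMPLE			/* illustration only, not compiled */
	/* leader side, after shm_toc_allocate() filled in btshared */
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_BTREE_SHARED, btshared);
	shm_toc_insert(pcxt->toc, PARALLEL_KEY_TUPLESORT, sharedsort);

	/* worker side, in _bt_parallel_build_main() */
	btshared = shm_toc_lookup(toc, PARALLEL_KEY_BTREE_SHARED, false);
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
#endif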
 * DISABLE_LEADER_PARTICIPATION disables the leader's participation in
 * parallel index builds.  This may be useful as a debugging aid.
#undef DISABLE_LEADER_PARTICIPATION
 * Status record for spooling/sorting phase.  (Note we may have two of
 * these due to the special requirements for uniqueness-checking with
 * dead tuples.)
 * Status for index builds performed in parallel.  This is allocated in a
 * dynamic shared memory segment.  Note that there is a separate tuplesort TOC
 * entry, private to tuplesort.c but allocated by this module on its behalf.
 *
 * These fields are not modified during the sort.  They primarily exist
 * for the benefit of worker processes that need to create BTSpool state
 * corresponding to that used by the leader.
/* Query ID, for report in worker processes */

 * workersdonecv is used to monitor the progress of workers.  All parallel
 * participants must indicate that they are done before leader can use
 * mutable state that workers maintain during scan (and before leader can
 * proceed to tuplesort_performsort()).
 *
 * mutex protects all fields before heapdesc.
 *
 * These fields contain status information of interest to B-Tree index
 * builds that must work just the same when an index is built in parallel.
 *
 * Mutable state that is maintained by workers, and reported back to
 * leader at end of parallel scan.
 *
 * nparticipantsdone is number of worker processes finished.
 *
 * reltuples is the total number of input heap tuples.
 *
 * havedead indicates if RECENTLY_DEAD tuples were encountered during build.
 *
 * indtuples is the total number of tuples that made it into the index.
 *
 * brokenhotchain indicates if any worker detected a broken HOT chain
 * during build.
 *
 * ParallelTableScanDescData data follows.  Can't directly embed here, as
 * implementations of the parallel table scan desc interface might need
 * stronger alignment.
 * Return pointer to a BTShared's parallel table scan.
 *
 * c.f. shm_toc_allocate as to why BUFFERALIGN is used, rather than just
#define ParallelTableScanFromBTShared(shared) \
	(ParallelTableScanDesc) ((char *) (shared) + BUFFERALIGN(sizeof(BTShared)))
 * Status for leader in parallel index build.

/* parallel context itself */

 * nparticipanttuplesorts is the exact number of worker processes
 * successfully launched, plus one leader process if it participates as a
 * worker (only DISABLE_LEADER_PARTICIPATION builds avoid leader
 * participating as a worker).
 *
 * Leader process convenience pointers to shared state (leader avoids TOC
 * lookups).
 *
 * btshared is the shared state for entire build.  sharedsort is the
 * shared, tuplesort-managed state passed to each process tuplesort.
 * sharedsort2 is the corresponding btspool2 shared state, used only when
 * building unique indexes.  snapshot is the snapshot used by the scan iff
 * an MVCC snapshot is required.
 * Working state for btbuild and its callback.
 *
 * When parallel CREATE INDEX is used, there is a BTBuildState for each
 * participant.
 *
 * spool2 is needed only when the index is a unique index.  Dead tuples are
 * put into spool2 instead of spool in order to avoid the uniqueness check.
 *
 * btleader is only present when a parallel index build is performed, and
 * only in the leader process.  (Actually, only the leader has a
 * BTBuildState.  Workers have their own spool and spool2, though.)
 * Status record for a btree page being built.  We have one of these
 * for each active tree level.
 *
 * Overall status record for index writing phase.

			   bool *isnull, bool tupleIsAlive, void *state);
			   bool newfirstdataitem);
			   bool *brokenhotchain);
 *	btbuild() -- build a new btree index.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */

buildstate.heap = heap;
buildstate.spool = NULL;

 * We expect to be called exactly once for any index relation.  If that's
 * not the case, big trouble's what we have.
elog(ERROR, "index \"%s\" already contains data",
	 RelationGetRelationName(index));

 * Finish the build by (1) completing the sort of the spool file, (2)
 * inserting the sorted tuples into btree pages and (3) building the upper
 * levels.  Finally, it may also be necessary to end use of parallelism.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */
 * Create and initialize one or two spool structures, and save them in caller's
 * buildstate argument.  May also fill-in fields within indexInfo used by index
 * builds.
 *
 * Scans the heap, possibly in parallel, filling spools with IndexTuples.  This
 * routine encapsulates all aspects of managing parallelism.  Caller need only
 * call _bt_end_parallel() in parallel case after it is done with spool/spool2.
 *
 * Returns the total number of heap tuples scanned.
double		reltuples = 0;

 * We size the sort area as maintenance_work_mem rather than work_mem to
 * speed index creation.  This should be OK since a single backend can't
 * run multiple index creations in parallel (see also: notes on
 * parallelism and maintenance_work_mem below).
btspool->heap = heap;

/* Save as primary spool */
buildstate->spool = btspool;
/* Report table scan phase started */

/* Attempt to launch parallel worker scan when required */

 * If parallel build requested and at least one worker process was
 * successfully launched, set up coordination state

 * Begin serial/leader tuplesort.
 *
 * In cases where parallelism is involved, the leader receives the same
 * share of maintenance_work_mem as a serial sort (it is generally treated
 * in the same way as a serial sort once we return).  Parallel worker
 * Tuplesortstates will have received only a fraction of
 * maintenance_work_mem, though.
 *
 * We rely on the lifetime of the Leader Tuplesortstate almost not
 * overlapping with any worker Tuplesortstate's lifetime.  There may be
 * some small overlap, but that's okay because we rely on leader
 * Tuplesortstate only allocating a small, fixed amount of memory here.
 * When its tuplesort_performsort() is called (by our caller), and
 * significant amounts of memory are likely to be used, all workers must
 * have already freed almost all memory held by their Tuplesortstates
 * (they are about to go away completely, too).  The overall effect is
 * that maintenance_work_mem always represents an absolute high watermark
 * on the amount of memory used by a CREATE INDEX operation, regardless of
 * the use of parallelism or any other factor.
 * If building a unique index, put dead tuples in a second spool to keep
 * them out of the uniqueness check.  We expect that the second spool (for
 * dead tuples) won't get very full, so we give it only work_mem.

/* Initialize secondary spool */
btspool2->heap = heap;

/* Save as secondary spool */
buildstate->spool2 = btspool2;

 * Set up non-private state that is passed to
 * tuplesort_begin_index_btree() about the basic high level
 * coordination of a parallel sort.
 *
 * We expect that the second one (for dead tuples) won't get very
 * full, so we give it only work_mem
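/*
 * Sketch of how the two sorts are typically begun (simplified; variable and
 * field names mirror the fragments above but take them as assumptions, and
 * coordinate/coordinate2 are NULL for a plain serial build): the primary
 * spool gets maintenance_work_mem, the dead-tuple spool only work_mem.
 */
#ifdef NBTSORT_SPOOL_EXAMPLE			/* illustration only, not compiled */
	buildstate->spool->sortstate =
		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
									buildstate->nulls_not_distinct,
									maintenance_work_mem, coordinate,
									TUPLESORT_NONE);
	if (buildstate->spool2)
		buildstate->spool2->sortstate =
			tuplesort_begin_index_btree(heap, index, false, false,
										work_mem, coordinate2,
										TUPLESORT_NONE);
#endif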
/* Fill spool using either serial or parallel heap scan */

 * Set the progress target for the next phase.  Reset the block number
 * values set by table_index_build_scan
const int	progress_index[] = {
const int64 progress_vals[] = {

/* okay, all heap tuples are spooled */

/* spool2 turns out to be unnecessary */
buildstate->spool2 = NULL;
 * clean up a spool structure and its substructures.

 * spool an index entry into the sort file.

 * given a spool loaded by successive calls to _bt_spool,
 * create an entire btree.
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */

/* Execute the sort */
/* _bt_mkscankey() won't set allequalimage without metapage */
/* reserve the metapage */
_bt_load(&wstate, btspool, btspool2);

 * Per-tuple callback for table_index_build_scan
 *
 * insert the index tuple into the appropriate spool file for subsequent
 * processing.
if (tupleIsAlive || buildstate->spool2 == NULL)
/* dead tuples are put into spool2 */
 * allocate workspace for a new, clean btree page, not linked to any siblings.
/* Zero the page and set up standard page header info */
/* Initialize BT opaque state */
/* Make the P_HIKEY line pointer appear allocated */

 * emit a completed btree page, and release the working storage.
/* smgr_bulk_write took ownership of 'buf' */
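/*
 * Minimal sketch of the bulk write cycle used for every page emitted by
 * this file (index and blkno stand in for the relation and target block):
 * start a bulk write state for the index, fill buffers obtained from it,
 * hand each one back with its block number, then finish to flush and
 * WAL-log whatever remains.
 */
#ifdef NBTSORT_BULK_WRITE_EXAMPLE		/* illustration only, not compiled */
	BulkWriteState *bulkstate = smgr_bulk_start_rel(index, MAIN_FORKNUM);
	BulkWriteBuffer buf = smgr_bulk_get_buf(bulkstate);

	_bt_pageinit((Page) buf, BLCKSZ);	/* set up a standard btree page */
	smgr_bulk_write(bulkstate, blkno, buf, true);	/* takes ownership of buf */
	smgr_bulk_finish(bulkstate);
#endif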
 * allocate and initialize a new BTPageState.  the returned structure
 * is suitable for immediate use by _bt_buildadd.
/* create initial page for level */
/* and assign it a page position */
state->btps_lowkey = NULL;
/* initialize lastoff so first item goes into P_FIRSTKEY */
state->btps_lastextra = 0;
state->btps_level = level;
/* set "full" threshold based on level.  See notes at head of file. */
/* no parent level, yet */
state->btps_next = NULL;
 * Slide the array of ItemIds from the page back one slot (from P_FIRSTKEY to
 * P_HIKEY, overwriting P_HIKEY).
 *
 * _bt_blnewpage() makes the P_HIKEY line pointer appear allocated, but the
 * rightmost page on its level is not supposed to get a high key.  Now that
 * it's clear that this page is a rightmost page, remove the unneeded empty
 * P_HIKEY line pointer space.
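/*
 * A sketch of what that slide amounts to (illustrative, not the exact
 * function body): copy each ItemId one slot to the left, then shrink
 * pd_lower by one ItemId so the trailing slot disappears.
 */
#ifdef NBTSORT_SLIDELEFT_EXAMPLE		/* illustration only, not compiled */
	OffsetNumber off;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(rightmostpage);
	ItemId		previi = PageGetItemId(rightmostpage, P_HIKEY);

	for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off))
	{
		ItemId		thisii = PageGetItemId(rightmostpage, off);

		*previi = *thisii;
		previi = thisii;
	}
	((PageHeader) rightmostpage)->pd_lower -= sizeof(ItemIdData);
#endif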
 * Add an item to a page being built.
 *
 * This is very similar to nbtinsert.c's _bt_pgaddtup(), but this variant
 * raises an error directly.
 *
 * Note that our nbtsort.c caller does not know yet if the page will be
 * rightmost.  Offset P_FIRSTKEY is always assumed to be the first data key by
 * caller.  Page that turns out to be the rightmost on its level is fixed by
 * calling _bt_slideleft().
			   bool newfirstdataitem)
if (newfirstdataitem)
elog(ERROR, "failed to add item to the index page");
 * Add an item to a disk page from the sort output (or add a posting list
 * item formed from the sort output).
 *
 * We must be careful to observe the page layout conventions of nbtsearch.c:
 * - rightmost pages start data items at P_HIKEY instead of at P_FIRSTKEY.
 * - on non-leaf pages, the key portion of the first item need not be
 *	 stored, we should store only the link.
 *
 * A leaf page being built looks like:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp0 linp1 linp2 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |     ^ last                                       |
 * |                                                  |
 * +-------------+------------------------------------+
 * |             | itemN ...                          |
 * +-------------+------------------+-----------------+
 * |          ... item3 item2 item1 | "special space" |
 * +--------------------------------+-----------------+
 *
 * Contrast this with the diagram in bufpage.h; note the mismatch
 * between linps and items.  This is because we reserve linp0 as a
 * placeholder for the pointer to the "high key" item; when we have
 * filled up the page, we will set linp0 to point to itemN and clear
 * linpN.  On the other hand, if we find this is the last (rightmost)
 * page, we leave the items alone and slide the linp array over.  If
 * the high key is to be truncated, offset 1 is deleted, and we insert
 * the truncated high key at offset 1.
 *
 * 'last' pointer indicates the last offset added to the page.
 *
 * 'truncextra' is the size of the posting list in itup, if any.  This
 * information is stashed for the next call here, when we may benefit
 * from considering the impact of truncating away the posting list on
 * the page before deciding to finish the page off.  Posting lists are
 * often relatively large, so it is worth going to the trouble of
 * accounting for the saving from truncating away the posting list of
 * the tuple that becomes the high key (that may be the only way to
 * get close to target free space on the page).  Note that this is
 * only used for the soft fillfactor-wise limit, not the critical hard
 * limit.
Size		last_truncextra;

 * This is a handy place to check for cancel interrupts during the btree
 * load phase of index creation.
nbuf = state->btps_buf;
nblkno = state->btps_blkno;
last_off = state->btps_lastoff;
last_truncextra = state->btps_lastextra;
state->btps_lastextra = truncextra;

/* Leaf case has slightly different rules due to suffix truncation */
isleaf = (state->btps_level == 0);
 * Check whether the new item can fit on a btree page on current level at
 * all.
 *
 * Every newly built index will treat heap TID as part of the keyspace,
 * which imposes the requirement that new high keys must occasionally have
 * a heap TID appended within _bt_truncate().  That may leave a new pivot
 * tuple one or two MAXALIGN() quantums larger than the original
 * firstright tuple it's derived from.  v4 deals with the problem by
 * decreasing the limit on the size of tuples inserted on the leaf level
 * by the same small amount.  Enforce the new v4+ limit on the leaf level,
 * and the old limit on internal levels, since pivot tuples may need to
 * make use of the reserved space.  This should never fail on internal
 * pages.
 *
 * Check to see if current page will fit new item, with space left over to
 * append a heap TID during suffix truncation when page is a leaf page.
 *
 * It is guaranteed that we can fit at least 2 non-pivot tuples plus a
 * high key with heap TID when finishing off a leaf page, since we rely on
 * _bt_check_third_page() rejecting oversized non-pivot tuples.  On
 * internal pages we can always fit 3 pivot tuples with larger internal
 * page tuple limit (includes page high key).
 *
 * Most of the time, a page is only "full" in the sense that the soft
 * fillfactor-wise limit has been exceeded.  However, we must always leave
 * at least two items plus a high key on each page before starting a new
 * page.  Disregard fillfactor and insert on "full" current page if we
 * don't have the minimum number of items yet.  (Note that we deliberately
 * assume that suffix truncation neither enlarges nor shrinks new high key
 * when applying soft limit, except when last tuple has a posting list.)
Assert(last_truncextra == 0 || isleaf);
	(pgspc + last_truncextra < state->btps_full && last_off > P_FIRSTKEY))
 * Finish off the page and write it out.

/* Create new page of same level */
/* and assign it a page position */

 * We copy the last item on the page into the new page, and then
 * rearrange the old page so that the 'last item' becomes its high key
 * rather than a true data item.  There had better be at least two
 * items on the page already, else the page would be empty of useful
 * data.
 *
 * Move 'last' into the high key position on opage.  _bt_blnewpage()
 * allocated empty space for a line pointer when opage was first
 * created, so this is a matter of rearranging already-allocated space
 * on page, and initializing high key line pointer.  (Actually, leaf
 * pages must also swap oitup with a truncated version of oitup, which
 * is sometimes larger than oitup, though never by more than the space
 * needed to append a heap TID.)
 *
 * Truncate away any unneeded attributes from high key on leaf
 * level.  This is only done at the leaf level because downlinks
 * in internal pages are either negative infinity items, or get
 * their contents from copying from one level down.  See also:
 * _bt_split().
 *
 * We don't try to bias our choice of split point to make it more
 * likely that _bt_truncate() can truncate away more attributes,
 * whereas the split point used within _bt_split() is chosen much
 * more delicately.  Even still, the lastleft and firstright
 * tuples passed to _bt_truncate() here are at least not fully
 * equal to each other when deduplication is used, unless there is
 * a large group of duplicates (also, unique index builds usually
 * have few or no spool2 duplicates).  When the split point is
 * between two unequal tuples, _bt_truncate() will avoid including
 * a heap TID in the new high key, which is the most important
 * benefit of suffix truncation.
 *
 * Overwrite the old item with new truncated high key directly.
 * oitup is already located at the physical beginning of tuple
 * space, so this should directly reuse the existing tuple space.
elog(ERROR, "failed to add high key to the index page");
/* oitup should continue to point to the page's high key */

 * Link the old page into its parent, using its low key.  If we don't
 * have a parent, we have to create one; this adds a new btree level.
if (state->btps_next == NULL)

 * Save a copy of the high key from the old page.  It is also the low
 * key for the new page.

 * Set the sibling links for both pages.

 * Write out the old page.  _bt_blwritepage takes ownership of the
 * buffer.

 * Reset last_off to point to new page

 * By here, either original page is still the current page, or a new page
 * was created that became the current page.  Either way, the current page
 * definitely has space for new item.
 *
 * If the new item is the first for its page, it must also be the first
 * item on its entire level.  On later same-level pages, a low key for a
 * page will be copied from the prior page in the code above.  Generate a
 * minus infinity low key here instead.

 * Add the new item into the current page.
state->btps_buf = nbuf;
state->btps_blkno = nblkno;
state->btps_lastoff = last_off;
 * Finalize pending posting list tuple, and add it to the index.  Final tuple
 * is based on saved base tuple, and saved list of heap TIDs.
 *
 * This is almost like _bt_dedup_finish_pending(), but it adds a new tuple
 * using _bt_buildadd().
/* form a tuple with a posting list */
/* Calculate posting list overhead */
pfree(postingtuple);

 * Finish writing out the completed btree.

 * Each iteration of this loop completes one more level of the tree.

 * We have to link the last page on this level to somewhere.
 *
 * If we're at the top, it's the root, so attach it to the metapage.
 * Otherwise, add an entry for it to its parent using its low key.
 * This may cause the last page of the parent level to split, but
 * that's not a problem -- we haven't gotten to it yet.
 *
 * This is the rightmost page, so the ItemId array needs to be slid
 * back one slot.  Then we can dump out the page.
s->btps_buf = NULL;		/* writepage took ownership of the buffer */

 * As the last step in the process, construct the metapage and make it
 * point to the new root (unless we had no data at all, in which case it's
 * set to point to "P_NONE").  This changes the index to the "valid" state
 * by filling in a valid magic number in the metapage.
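/*
 * A sketch of how that final step can look (metabuf, rootblkno and
 * rootlevel are assumptions here): grab one more bulk-write buffer,
 * initialize it as the metapage pointing at the root, and write it out as
 * the metapage block.
 */
#ifdef NBTSORT_METAPAGE_EXAMPLE			/* illustration only, not compiled */
	metabuf = smgr_bulk_get_buf(wstate->bulkstate);
	_bt_initmetapage((Page) metabuf, rootblkno, rootlevel,
					 wstate->inskey->allequalimage);
	_bt_blwritepage(wstate, metabuf, BTREE_METAPAGE);
#endif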
 * Read tuples in correct sort order from tuplesort, and load them into
 * btree leaves.
bool		merge = (btspool2 != NULL);
int64		tuples_done = 0;

 * Another BTSpool for dead tuples exists.  Now we have to merge
 * btspool and btspool2.

/* the preparation of merge */

/* Prepare SortSupport data for each column */
for (i = 0; i < keysz; i++)
/* Abbreviation is not supported here */
load1 = true;			/* load BTSpool next ? */
else if (itup != NULL)
for (i = 1; i <= keysz; i++)
entry = sortKeys + i - 1;
				attrDatum2, isNull2,

 * If key values are equal, we sort on ItemPointer.  This is
 * required for btree indexes, since heap TID is treated as an
 * implicit last key attribute in order to ensure that all
 * keys in the index are physically unique.
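/*
 * Sketch of the merge comparison described above (simplified; variable
 * names such as itup2, tupdes and sortKeys are assumptions): compare the
 * two candidate tuples column by column with the prepared SortSupport
 * entries, and fall back to heap TID order when all key columns compare
 * equal.
 */
#ifdef NBTSORT_MERGE_COMPARE_EXAMPLE	/* illustration only, not compiled */
	for (i = 1; i <= keysz; i++)
	{
		SortSupport entry = sortKeys + i - 1;
		Datum		attrDatum1 = index_getattr(itup, i, tupdes, &isNull1);
		Datum		attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2);

		compare = ApplySortComparator(attrDatum1, isNull1,
									  attrDatum2, isNull2, entry);
		if (compare != 0)
			break;
	}
	if (compare == 0)
		compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid);
	load1 = (compare <= 0);		/* take from btspool when it sorts first */
#endif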
/* When we see first tuple, create first index page */
/* Report progress */
else if (deduplicate)
/* merge is unnecessary, deduplicate into posting lists */

/* Metadata about base tuple of current pending posting list */
dstate->base = NULL;
/* Metadata about current pending posting list TIDs */
dstate->htids = NULL;

/* When we see first tuple, create first index page */

 * Limit size of posting list tuples to 1/10 space we want to
 * leave behind on the page, plus space for final item's line
 * pointer.  This is equal to the space that we'd like to
 * leave behind on each leaf page when fillfactor is 90,
 * allowing us to get close to fillfactor% space utilization
 * when there happen to be a great many duplicates.  (This
 * makes higher leaf fillfactor settings ineffective when
 * building indexes that have many duplicates, but packing
 * leaf pages full with few very large tuples doesn't seem
 * like a useful goal.)
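/*
 * Sketch of the size cap that comment describes, assuming the default 8kB
 * block size and 8-byte MAXALIGN: roughly a tenth of the block, rounded
 * down to a MAXALIGN boundary, less one line pointer.
 */
#ifdef NBTSORT_POSTING_LIMIT_EXAMPLE	/* illustration only, not compiled */
	dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) -
		sizeof(ItemIdData);				/* 816 - 4 = 812 bytes with BLCKSZ = 8192 */
#endif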
/* start new pending posting list with itup copy */

 * Tuple is equal to base tuple of pending posting list.  Heap
 * TID from itup has been saved in state.
 *
 * Tuple is not equal to pending posting list tuple, or
 * _bt_dedup_save_htid() opted to not merge current item into
 * pending posting list.
/* start new pending posting list with itup copy */
/* Report progress */

 * Handle the last item (there must be a last item when the
 * tuplesort returned one or more tuples)

/* merging and deduplication are both unnecessary */
/* When we see first tuple, create first index page */
/* Report progress */

/* Close down final pages and write the metapage */
 * Create parallel context, and launch workers for leader.
 *
 * buildstate argument should be initialized (with the exception of the
 * tuplesort state in spools, which may later be created based on shared
 * state initially set up here).
 *
 * isconcurrent indicates if operation is CREATE INDEX CONCURRENTLY.
 *
 * request is the target number of parallel worker processes to launch.
 *
 * Sets buildstate's BTLeader, which caller must use to shut down parallel
 * mode by passing it to _bt_end_parallel() at the very end of its index
 * build.  If not even a single worker process can be launched, this is
 * never set, and caller should proceed with a serial index build.
int			scantuplesortstates;
bool		leaderparticipates = true;
#ifdef DISABLE_LEADER_PARTICIPATION
leaderparticipates = false;

 * Enter parallel mode, and create context for parallel build of btree
 * index
scantuplesortstates = leaderparticipates ? request + 1 : request;

 * Prepare for scan of the base relation.  In a normal index build, we use
 * SnapshotAny because we must retrieve all tuples and do our own time
 * qual checks (because we have to index RECENTLY_DEAD tuples).  In a
 * concurrent build, we take a regular MVCC snapshot and index whatever's
 * live according to that.
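/*
 * Sketch of the snapshot choice just described (variable names are
 * assumptions): SnapshotAny for a normal build, a registered MVCC snapshot
 * for a concurrent one.
 */
#ifdef NBTSORT_SNAPSHOT_EXAMPLE			/* illustration only, not compiled */
	if (!isconcurrent)
		snapshot = SnapshotAny;
	else
		snapshot = RegisterSnapshot(GetTransactionSnapshot());
#endif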
 * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and
 * PARALLEL_KEY_TUPLESORT tuplesort workspace
 *
 * Unique case requires a second spool, and so we may have to account for
 * another shared workspace for that -- PARALLEL_KEY_TUPLESORT_SPOOL2
 *
 * Estimate space for WalUsage and BufferUsage -- PARALLEL_KEY_WAL_USAGE
 * and PARALLEL_KEY_BUFFER_USAGE.
 *
 * If there are no extensions loaded that care, we could skip this.  We
 * have no way of knowing whether anyone's looking at pgWalUsage or
 * pgBufferUsage, so do it unconditionally.

/* Finally, estimate PARALLEL_KEY_QUERY_TEXT space */
querylen = 0;			/* keep compiler quiet */

/* Everyone's had a chance to ask for space, so now create the DSM */

/* If no DSM segment was available, back out (do serial build) */
if (pcxt->seg == NULL)
/* Store shared build state, for which we reserved space */
/* Initialize immutable state */
/* Initialize mutable state */

 * Store shared tuplesort-private state, for which we reserved space.
 * Then, initialize opaque state using tuplesort routine.

/* Unique case requires a second spool, and associated shared state */

 * Store additional shared tuplesort-private state, for which we
 * reserved space.  Then, initialize opaque state using tuplesort
 * routine.

/* Store query string for workers */

 * Allocate space for each worker's WalUsage and BufferUsage; no need to
 * initialize.

/* Launch workers, saving status for leader/caller */
btleader->pcxt = pcxt;
if (leaderparticipates)

/* If no workers were successfully launched, back out (do serial build) */

/* Save leader state now that it's clear build will be parallel */

/* Join heap scan ourselves */
if (leaderparticipates)

 * Caller needs to wait for all launched workers when we return.  Make
 * sure that the failure-to-start case will not hang forever.

 * Shut down workers, destroy parallel context, and end parallel mode.
/* Shutdown worker processes */

 * Next, accumulate WAL usage.  (This must wait for the workers to finish,
 * or we might get incomplete data.)

/* Free last reference to MVCC snapshot, if one was used */

 * Returns size of shared memory required to store state for a parallel
 * btree index build based on the snapshot its parallel scan will use.
/* c.f. shm_toc_allocate as to why BUFFERALIGN is used */

 * Within leader, wait for end of heap scan.
 *
 * When called, parallel heap scan started by _bt_begin_parallel() will
 * already be underway within worker processes (when leader participates
 * as a worker, we should end up here just as workers are finishing).
 *
 * Fills in fields needed for ambuild statistics, and lets caller set
 * field indicating that some worker encountered a broken HOT chain.
 *
 * Returns the total number of heap tuples scanned.
int			nparticipanttuplesorts;
					   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
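/*
 * Sketch of the wait loop that call belongs to (simplified; the real loop
 * also copies the mutable counters out while it holds the lock): recheck
 * the done-count under the spinlock and sleep on workersdonecv until every
 * participant has reported in.
 */
#ifdef NBTSORT_HEAPSCAN_WAIT_EXAMPLE	/* illustration only, not compiled */
	for (;;)
	{
		SpinLockAcquire(&btshared->mutex);
		if (btshared->nparticipantsdone == nparticipanttuplesorts)
		{
			SpinLockRelease(&btshared->mutex);
			break;
		}
		SpinLockRelease(&btshared->mutex);

		ConditionVariableSleep(&btshared->workersdonecv,
							   WAIT_EVENT_PARALLEL_CREATE_INDEX_SCAN);
	}
	ConditionVariableCancelSleep();
#endif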
 * Within leader, participate as a parallel worker.
/* Allocate memory and initialize private spool */

/* Initialize second spool, if required */
leaderworker2 = NULL;

/* Allocate memory for worker's own private secondary spool */
/* Initialize worker's own secondary spool */
leaderworker2->heap = leaderworker->heap;

 * Might as well use reliable figure when doling out maintenance_work_mem
 * (when requested number of workers were not launched, this will be
 * somewhat higher than it is for other workers).

/* Perform work common to all participants */
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Leader Partial Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */
 * Perform work within a launched parallel process.
#ifdef BTREE_BUILD_STATS
#endif							/* BTREE_BUILD_STATS */

 * The only possible status flag that can be set to the parallel worker is

/* Set debug_query_string for individual workers first */
/* Report the query string from leader */

/* Look up nbtree shared state */

/* Open relations using lock modes known to be obtained by index.c */

/* Track query ID */

/* Open relations within worker */

/* Initialize worker's own spool */
btspool->heap = heapRel;
btspool->index = indexRel;

/* Look up shared state private to tuplesort.c */

/* Allocate memory for worker's own private secondary spool */
/* Initialize worker's own secondary spool */

/* Look up shared state private to tuplesort.c */
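/*
 * Sketch of those lookups (variable names are assumptions): fetch the
 * Sharedsort structs the leader published under the tuplesort keys and
 * attach this worker's tuplesorts to them.
 */
#ifdef NBTSORT_WORKER_LOOKUP_EXAMPLE	/* illustration only, not compiled */
	sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false);
	tuplesort_attach_shared(sharedsort, seg);
	if (btshared->isunique)
	{
		sharedsort2 = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT_SPOOL2, false);
		tuplesort_attach_shared(sharedsort2, seg);
	}
#endif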
/* Prepare to track buffer usage during parallel execution */

/* Perform sorting of spool, and possibly a spool2 */
							   sharedsort2, sortmem, false);

/* Report WAL/buffer usage during parallel execution */
#ifdef BTREE_BUILD_STATS
ShowUsage("BTREE BUILD (Worker Partial Spool) STATISTICS");
#endif							/* BTREE_BUILD_STATS */
 * Perform a worker's portion of a parallel sort.
 *
 * This generates a tuplesort for passed btspool, and a second tuplesort
 * state if a second btspool is needed (i.e. for unique index builds).  All
 * other spool fields should already be set when this is called.
 *
 * sortmem is the amount of working memory to use within each worker,
 * expressed in KBs.
 *
 * When this returns, workers are done, and need only release resources.
/* Initialize local tuplesort coordination state */
/* Begin "partial" tuplesort */
							sortmem, coordinate,

 * Just as with serial case, there may be a second spool.  If so, a
 * second, dedicated spool2 partial tuplesort is required.
 *
 * We expect that the second one (for dead tuples) won't get very
 * full, so we give it only work_mem (unless sortmem is less for
 * worker).  Worker processes are generally permitted to allocate
 * work_mem independently.
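/*
 * Sketch of how the dedicated spool2 sort might be begun in a worker
 * (structure-member names are assumptions): a fresh SortCoordinate tied to
 * sharedsort2, with the smaller of sortmem and work_mem as its budget.
 */
#ifdef NBTSORT_WORKER_SPOOL2_EXAMPLE	/* illustration only, not compiled */
	coordinate2 = (SortCoordinate) palloc0(sizeof(SortCoordinateData));
	coordinate2->isWorker = true;
	coordinate2->nParticipants = -1;
	coordinate2->sharedsort = sharedsort2;
	btspool2->sortstate =
		tuplesort_begin_index_btree(btspool->heap, btspool->index,
									false, false,
									Min(sortmem, work_mem), coordinate2,
									TUPLESORT_NONE);
#endif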
/* Fill in buildstate for _bt_build_callback() */
buildstate.spool = btspool;
buildstate.spool2 = btspool2;

/* Join parallel scan */

/* Execute this worker's part of the sort */

 * Done.  Record ambuild statistics, and whether we encountered a broken
 * HOT chain.

/* We can end tuplesorts immediately */