/*-------------------------------------------------------------------------
 *
 * tuplesort.c
 *	  Generalized tuple sorting routines.
 *
 * This module provides a generalized facility for tuple sorting, which can be
 * applied to different kinds of sortable objects.  Implementation of
 * the particular sorting variants is given in tuplesortvariants.c.
 * This module works efficiently for both small and large amounts
 * of data.  Small amounts are sorted in-memory using qsort().  Large
 * amounts are sorted using temporary files and a standard external sort
 * algorithm.
 *
 * See Knuth, volume 3, for more than you want to know about external
 * sorting algorithms.  The algorithm we use is a balanced k-way merge.
 * Before PostgreSQL 15, we used the polyphase merge algorithm (Knuth's
 * Algorithm 5.4.2D), but with modern hardware, a straightforward balanced
 * merge is better.  Knuth is assuming that tape drives are expensive
 * beasts, and in particular that there will always be many more runs than
 * tape drives.  The polyphase merge algorithm was good at keeping all the
 * tape drives busy, but in our implementation a "tape drive" doesn't cost
 * much more than a few Kb of memory buffers, so we can afford to have
 * lots of them.  In particular, if we can have as many tape drives as
 * sorted runs, we can eliminate any repeated I/O at all.
 *
 * Historically, we divided the input into sorted runs using replacement
 * selection, in the form of a priority tree implemented as a heap
 * (essentially Knuth's Algorithm 5.2.3H), but now we always use quicksort
 * for run generation.
 *
 * The approximate amount of memory allowed for any one sort operation
 * is specified in kilobytes by the caller (most pass work_mem).  Initially,
 * we absorb tuples and simply store them in an unsorted array as long as
 * we haven't exceeded workMem.  If we reach the end of the input without
 * exceeding workMem, we sort the array using qsort() and subsequently return
 * tuples just by scanning the tuple array sequentially.  If we do exceed
 * workMem, we begin to emit tuples into sorted runs in temporary tapes.
 * When tuples are dumped in batch after quicksorting, we begin a new run
 * with a new output tape.  If we reach the max number of tapes, we write
 * subsequent runs on the existing tapes in a round-robin fashion.  We will
 * need multiple merge passes to finish the merge in that case.  After the
 * end of the input is reached, we dump out remaining tuples in memory into
 * a final run, then merge the runs.
 *
 * When merging runs, we use a heap containing just the frontmost tuple from
 * each source run; we repeatedly output the smallest tuple and replace it
 * with the next tuple from its source tape (if any).  When the heap empties,
 * the merge is complete.  The basic merge algorithm thus needs very little
 * memory --- only M tuples for an M-way merge, and M is constrained to a
 * small number.  However, we can still make good use of our full workMem
 * allocation by pre-reading additional blocks from each source tape.  Without
 * prereading, our access pattern to the temporary file would be very erratic;
 * on average we'd read one block from each of M source tapes during the same
 * time that we're writing M blocks to the output tape, so there is no
 * sequentiality of access at all, defeating the read-ahead methods used by
 * most Unix kernels.  Worse, the output tape gets written into a very random
 * sequence of blocks of the temp file, ensuring that things will be even
 * worse when it comes time to read that tape.  A straightforward merge pass
 * thus ends up doing a lot of waiting for disk seeks.  We can improve matters
 * by prereading from each source tape sequentially, loading about workMem/M
 * bytes from each tape in turn, and making the sequential blocks immediately
 * available for reuse.  This approach helps to localize both read and write
 * accesses.  The pre-reading is handled by logtape.c, we just tell it how
 * much memory to use for the buffers.
 *
 * In the current code we determine the number of input tapes M on the basis
 * of workMem: we want workMem/M to be large enough that we read a fair
 * amount of data each time we read from a tape, so as to maintain the
 * locality of access described above.  Nonetheless, with large workMem we
 * can have many tapes.  The logical "tapes" are implemented by logtape.c,
 * which avoids space wastage by recycling disk space as soon as each block
 * is read from its "tape".
 *
 * When the caller requests random access to the sort result, we form
 * the final sorted run on a logical tape which is then "frozen", so
 * that we can access it randomly.  When the caller does not need random
 * access, we return from tuplesort_performsort() as soon as we are down
 * to one run per logical tape.  The final merge is then performed
 * on-the-fly as the caller repeatedly calls tuplesort_getXXX; this
 * saves one cycle of writing all the data out to disk and reading it in.
 *
 * This module supports parallel sorting.  Parallel sorts involve coordination
 * among one or more worker processes, and a leader process, each with its own
 * tuplesort state.  The leader process (or, more accurately, the
 * Tuplesortstate associated with a leader process) creates a full tapeset
 * consisting of worker tapes with one run to merge; a run for every
 * worker process.  This is then merged.  Worker processes are guaranteed to
 * produce exactly one output run from their partial input.
 *
 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/utils/sort/tuplesort.c
 *
 *-------------------------------------------------------------------------
 */
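/*
 * Illustrative, self-contained sketch (not PostgreSQL code) of the balanced
 * merge idea described in the header comment above: keep only the frontmost
 * element of each sorted run in a small binary min-heap, repeatedly emit the
 * heap's minimum, and refill from the run it came from.  All names here are
 * hypothetical; the real implementation operates on SortTuples and logical
 * tapes.
 */
#ifdef TUPLESORT_MERGE_SKETCH	/* illustration only; never compiled */
#include <stdio.h>

typedef struct
{
	int			value;			/* frontmost value of a run */
	int			src;			/* which run it came from */
} MergeHeapEntry;

static void
merge_sketch_sift_down(MergeHeapEntry *heap, int n, int i)
{
	for (;;)
	{
		int			left = 2 * i + 1;
		int			right = 2 * i + 2;
		int			smallest = i;
		MergeHeapEntry tmp;

		if (left < n && heap[left].value < heap[smallest].value)
			smallest = left;
		if (right < n && heap[right].value < heap[smallest].value)
			smallest = right;
		if (smallest == i)
			return;
		tmp = heap[i];
		heap[i] = heap[smallest];
		heap[smallest] = tmp;
		i = smallest;
	}
}

int
main(void)
{
	int			runs[3][4] = {{1, 4, 7, 10}, {2, 5, 8, 11}, {3, 6, 9, 12}};
	int			pos[3] = {0, 0, 0};
	MergeHeapEntry heap[3];
	int			n = 3;

	/* load the frontmost element of each run; 1,2,3 is already a min-heap */
	for (int i = 0; i < n; i++)
	{
		heap[i].value = runs[i][pos[i]++];
		heap[i].src = i;
	}

	while (n > 0)
	{
		printf("%d ", heap[0].value);	/* emit current minimum */
		if (pos[heap[0].src] < 4)
			heap[0].value = runs[heap[0].src][pos[heap[0].src]++];
		else
			heap[0] = heap[--n];	/* run exhausted: shrink the heap */
		merge_sketch_sift_down(heap, n, 0);
	}
	printf("\n");				/* prints 1 2 3 ... 12 */
	return 0;
}
#endif							/* TUPLESORT_MERGE_SKETCH */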
/*
 * Initial size of memtuples array.  We're trying to select this size so that
 * array doesn't exceed ALLOCSET_SEPARATE_THRESHOLD and so that the overhead of
 * allocation might possibly be lowered.  However, we don't consider array sizes
 * less than 1024.
 */
#define INITIAL_MEMTUPSIZE Max(1024, \
								ALLOCSET_SEPARATE_THRESHOLD / sizeof(SortTuple) + 1)

#ifdef DEBUG_BOUNDED_SORT
bool		optimize_bounded_sort = true;
#endif
/*
 * During merge, we use a pre-allocated set of fixed-size slots to hold
 * tuples, to avoid palloc/pfree overhead.
 *
 * Merge doesn't require a lot of memory, so we can afford to waste some,
 * by using gratuitously-sized slots.  If a tuple is larger than 1 kB, the
 * palloc() overhead is not significant anymore.
 *
 * 'nextfree' is valid when this chunk is in the free list.  When in use, the
 * slot holds a tuple.
 */
#define SLAB_SLOT_SIZE 1024
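/*
 * Minimal sketch (the actual declaration is not shown here) of how a
 * fixed-size slab slot can double as a free-list link: an unused slot stores
 * only the 'nextfree' pointer, while a slot in use holds tuple data in its
 * SLAB_SLOT_SIZE buffer.
 */
typedef union SlabSlotSketch
{
	union SlabSlotSketch *nextfree; /* next slot in the free list, when unused */
	char		buffer[SLAB_SLOT_SIZE]; /* tuple data, when in use */
} SlabSlotSketch;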
/*
 * Possible states of a Tuplesort object.  These denote the states that
 * persist between calls of Tuplesort routines.
 */

/*
 * Parameters for calculation of number of tapes to use --- see inittapes()
 * and tuplesort_merge_order().
 *
 * In this calculation we assume that each tape will cost us about one block's
 * worth of buffer space.  This ignores the overhead of all the other data
 * structures needed for each tape, but it's probably close enough.
 *
 * MERGE_BUFFER_SIZE is how much buffer space we'd like to allocate for each
 * input tape, for pre-reading (see discussion at top of file).  This is *in
 * addition to* the 1 block already included in TAPE_BUFFER_OVERHEAD.
 */
#define MINORDER		6		/* minimum merge order */
#define MAXORDER		500		/* maximum merge order */
#define TAPE_BUFFER_OVERHEAD		BLCKSZ
#define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
/*
 * Private state of a Tuplesort operation.
 */
	bool		bounded;		/* did caller specify a maximum number of
								 * tuples to return? */
	bool		boundUsed;		/* true if we made use of a bounded heap */
	int			bound;			/* if bounded, the maximum number of tuples */
								/* storing this separately from what we track
								 * in availMem allows us to subtract the
								 * memory consumed by all tuples when dumping */
	int			maxTapes;		/* max number of input tapes to merge in each */
								/* of groups, either in-memory or on-disk */
								/* space, false when its value for in-memory */
	/*
	 * This array holds the tuples now in sort memory.  If we are in state
	 * INITIAL, the tuples are in no particular order; if we are in state
	 * SORTEDINMEM, the tuples are in final sorted order; in states BUILDRUNS
	 * and FINALMERGE, the tuples are organized in "heap" order per Algorithm
	 * H.  In state SORTEDONTAPE, the array is not used.
	 */

	/*
	 * Memory for tuples is sometimes allocated using a simple slab allocator,
	 * rather than with palloc().  Currently, we switch to slab allocation
	 * when we start merging.  Merging only needs to keep a small, fixed
	 * number of tuples in memory at any time, so we can avoid the
	 * palloc/pfree overhead by recycling a fixed number of fixed-size slots
	 * to hold the tuples.
	 *
	 * For the slab, we use one large allocation, divided into SLAB_SLOT_SIZE
	 * slots.  The allocation is sized to have one slot per tape, plus one
	 * additional slot.  We need that many slots to hold all the tuples kept
	 * in the heap during merge, plus the one we have last returned from the
	 * sort, with tuplesort_gettuple.
	 *
	 * Initially, all the slots are kept in a linked list of free slots.  When
	 * a tuple is read from a tape, it is put to the next available slot, if
	 * it fits.  If the tuple is larger than SLAB_SLOT_SIZE, it is palloc'd
	 * instead.
	 *
	 * When we're done processing a tuple, we return the slot back to the free
	 * list, or pfree() if it was palloc'd.  We know that a tuple was
	 * allocated from the slab, if its pointer value is between
	 * slabMemoryBegin and -End.
	 *
	 * When the slab allocator is used, the USEMEM/LACKMEM mechanism of
	 * tracking memory usage is not used.
	 */
	/* Memory used for input and output tape buffers. */

	/*
	 * When we return a tuple to the caller in tuplesort_gettuple_XXX, that
	 * came from a tape (that is, in TSS_SORTEDONTAPE or TSS_FINALMERGE
	 * modes), we remember the tuple in 'lastReturnedTuple', so that we can
	 * recycle the memory on next gettuple call.
	 */

	/*
	 * While building initial runs, this is the current output run number.
	 * Afterwards, it is the number of initial runs we made.
	 */

	/*
	 * Logical tapes, for merging.
	 *
	 * The initial runs are written in the output tapes.  In each merge pass,
	 * the output tapes of the previous pass become the input tapes, and new
	 * output tapes are created as needed.  When nInputTapes equals
	 * nInputRuns, there is only one merge pass left.
	 */

	/*
	 * These variables are used after completion of sorting to keep track of
	 * the next tuple to return.  (In the tape case, the tape's current read
	 * position is also critical state.)
	 */
	int			current;		/* array index (only used if SORTEDINMEM) */

	/* markpos_xxx holds marked position for mark and restore */

	/*
	 * These variables are used during parallel sorting.
	 *
	 * worker is our worker identifier.  Follows the general convention that
	 * -1 value relates to a leader tuplesort, and values >= 0 worker
	 * tuplesorts. (-1 can also be a serial tuplesort.)
	 *
	 * shared is mutable shared memory state, which is used to coordinate
	 * parallel sorts.
	 *
	 * nParticipants is the number of worker Tuplesortstates known by the
	 * leader to have actually been launched, which implies that they must
	 * finish a run that the leader needs to merge.  Typically includes a
	 * worker state held by the leader process itself.  Set in the leader
	 * Tuplesortstate only.
	 */

	/*
	 * Additional state for managing "abbreviated key" sortsupport routines
	 * (which currently may be used by all cases except the hash index case).
	 * Tracks the intervals at which the optimization's effectiveness is
	 * checked.
	 */

	/* Resource snapshot for time of sort start. */
/*
 * Private mutable state of tuplesort-parallel-operation.  This is allocated
 * in shared memory.
 */
	/* mutex protects all fields prior to tapes */

	/*
	 * currentWorker generates ordinal identifier numbers for parallel sort
	 * workers.  These start from 0, and are always gapless.
	 *
	 * Workers increment workersFinished to indicate having finished.  If this
	 * is equal to state.nParticipants within the leader, leader is ready to
	 * merge worker runs.
	 */

	/* Temporary file space */

	/* Size of tapes flexible array */

	/*
	 * Tapes array used by workers to report back information needed by the
	 * leader to concatenate all worker tapes into one for merging.
	 */
/*
 * Is the given tuple allocated from the slab memory arena?
 */
#define IS_SLAB_SLOT(state, tuple) \
	((char *) (tuple) >= (state)->slabMemoryBegin && \
	 (char *) (tuple) < (state)->slabMemoryEnd)
/*
 * Return the given tuple to the slab memory free list, or free it
 * if it was palloc'd.
 */
#define RELEASE_SLAB_SLOT(state, tuple) \
	do { \
		SlabSlot *buf = (SlabSlot *) tuple; \
		\
		if (IS_SLAB_SLOT((state), buf)) \
		{ \
			buf->nextfree = (state)->slabFreeHead; \
			(state)->slabFreeHead = buf; \
		} \
		else \
			pfree(buf); \
	} while (0)
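/*
 * Hypothetical helper illustrating the intended use of RELEASE_SLAB_SLOT:
 * once the merge code is finished with a tuple that was read from tape, its
 * backing memory goes back on the slab free list, or is pfree'd if the tuple
 * was too large for a slab slot.
 */
static inline void
release_tuple_memory_sketch(Tuplesortstate *state, void *tuple)
{
	if (tuple != NULL)
		RELEASE_SLAB_SLOT(state, tuple);
}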
#define REMOVEABBREV(state,stup,count)	((*(state)->base.removeabbrev) (state, stup, count))
#define COMPARETUP(state,a,b)	((*(state)->base.comparetup) (a, b, state))
#define WRITETUP(state,tape,stup)	((*(state)->base.writetup) (state, tape, stup))
#define READTUP(state,stup,tape,len) ((*(state)->base.readtup) (state, stup, tape, len))
#define FREESTATE(state)	((state)->base.freestate ? (*(state)->base.freestate) (state) : (void) 0)
#define LACKMEM(state)		((state)->availMem < 0 && !(state)->slabAllocatorUsed)
#define USEMEM(state,amt)	((state)->availMem -= (amt))
#define FREEMEM(state,amt)	((state)->availMem += (amt))
#define SERIAL(state)		((state)->shared == NULL)
#define WORKER(state)		((state)->shared && (state)->worker != -1)
#define LEADER(state)		((state)->shared && (state)->worker == -1)
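/*
 * Hedged sketch of one half of the slab protocol discussed above: when a
 * tuple read from tape fits in SLAB_SLOT_SIZE, pop a slot off the free list;
 * larger tuples fall back to palloc().  The real routine in this file differs
 * in detail.
 */
static void *
slab_alloc_sketch(Tuplesortstate *state, Size tuplen)
{
	if (tuplen <= SLAB_SLOT_SIZE && state->slabFreeHead != NULL)
	{
		SlabSlot   *slot = state->slabFreeHead;

		state->slabFreeHead = slot->nextfree;	/* pop from the free list */
		return slot;
	}
	return palloc(tuplen);		/* oversized tuple: ordinary allocation */
}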
/*
 * NOTES about on-tape representation of tuples:
 *
 * We require the first "unsigned int" of a stored tuple to be the total size
 * on-tape of the tuple, including itself (so it is never zero; an all-zero
 * unsigned int is used to delimit runs).  The remainder of the stored tuple
 * may or may not match the in-memory representation of the tuple ---
 * any conversion needed is the job of the writetup and readtup routines.
 *
 * If state->sortopt contains TUPLESORT_RANDOMACCESS, then the stored
 * representation of the tuple must be followed by another "unsigned int" that
 * is a copy of the length --- so the total tape space used is actually
 * sizeof(unsigned int) more than the stored length value.  This allows
 * read-backwards.  When the random access flag is not specified, the
 * write/read routines may omit the extra length word.
 *
 * writetup is expected to write both length words as well as the tuple
 * data.  When readtup is called, the tape is positioned just after the
 * front length word; readtup must read the tuple data and advance past
 * the back length word (if present).
 *
 * The write/read routines can make use of the tuple description data
 * stored in the Tuplesortstate record, if needed.  They are also expected
 * to adjust state->availMem by the amount of memory space (not tape space!)
 * released or consumed.  There is no error return from either writetup
 * or readtup; they should ereport() on failure.
 */
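/*
 * Hedged sketch of a writetup routine that follows the framing rules above.
 * The payload ('data', 'datalen') is hypothetical; real writetup routines
 * live in tuplesortvariants.c, and the LogicalTapeWrite() calls assume
 * logtape.c's current interface.
 */
static void
writetup_sketch(Tuplesortstate *state, LogicalTape *tape,
				void *data, unsigned int datalen)
{
	/* leading length word counts itself plus the tuple body */
	unsigned int tuplen = datalen + sizeof(unsigned int);

	LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));
	LogicalTapeWrite(tape, data, datalen);
	if (state->base.sortopt & TUPLESORT_RANDOMACCESS)
		LogicalTapeWrite(tape, &tuplen, sizeof(tuplen));	/* trailing copy */
}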
/*
 * NOTES about memory consumption calculations:
 *
 * We count space allocated for tuples against the workMem limit, plus
 * the space used by the variable-size memtuples array.  Fixed-size space
 * is not counted; it's small enough to not be interesting.
 *
 * Note that we count actual space used (as shown by GetMemoryChunkSpace)
 * rather than the originally-requested size.  This is important since
 * palloc can add substantial overhead.  It's not a complete answer since
 * we won't count any wasted space in palloc allocation blocks, but it's
 * a lot better than what we were doing before 7.3.  As of 9.6, a
 * separate memory context is used for caller passed tuples.  Resetting
 * it at certain key increments significantly ameliorates fragmentation.
 * readtup routines use the slab allocator (they cannot use
 * the reset context because it gets deleted at the point that merging
 * begins).
 */
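/*
 * Illustrative sketch of the accounting convention described above: charge
 * the space actually allocated (per GetMemoryChunkSpace) against availMem,
 * not the requested size.  'tup' is a hypothetical palloc'd tuple.
 */
static inline void
account_for_tuple_sketch(Tuplesortstate *state, void *tup)
{
	USEMEM(state, GetMemoryChunkSpace(tup));
}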
/*
 * Specialized comparators that we can inline into specialized sorts.  The goal
 * is to try to sort two tuples without having to follow the pointers to the
 * comparator or the tuple.
 *
 * XXX: For now, there is no specialization for cases where datum1 is
 * authoritative and we don't even need to fall back to a callback at all (that
 * would be true for types like int4/int8/timestamp/date, but not true for
 * abbreviations of text or multi-key sorts).  There could be!  Is it worth it?
 */
/* Used if first key's comparator is ssup_datum_unsigned_cmp */
										  b->datum1, b->isnull1,
										  &state->base.sortKeys[0]);

	/*
	 * No need to waste effort calling the tiebreak function when there are no
	 * other keys to sort on.
	 */
	if (state->base.onlyKey != NULL)

/* Used if first key's comparator is ssup_datum_signed_cmp */
										b->datum1, b->isnull1,
										&state->base.sortKeys[0]);

	/*
	 * No need to waste effort calling the tiebreak function when there are no
	 * other keys to sort on.
	 */
	if (state->base.onlyKey != NULL)

/* Used if first key's comparator is ssup_datum_int32_cmp */
									   b->datum1, b->isnull1,
									   &state->base.sortKeys[0]);

	/*
	 * No need to waste effort calling the tiebreak function when there are no
	 * other keys to sort on.
	 */
	if (state->base.onlyKey != NULL)
/*
 * Special versions of qsort just for SortTuple objects.  qsort_tuple() sorts
 * any variant of SortTuples, using the appropriate comparetup function.
 * qsort_ssup() is specialized for the case where the comparetup function
 * reduces to ApplySortComparator(), that is single-key MinimalTuple sorts
 * and Datum sorts.  qsort_tuple_{unsigned,signed,int32} are specialized for
 * common comparison functions on pass-by-value leading datums.
 */

#define ST_SORT qsort_tuple_unsigned
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE(a, b, state) qsort_tuple_unsigned_compare(a, b, state)
#define ST_COMPARE_ARG_TYPE Tuplesortstate
#define ST_CHECK_FOR_INTERRUPTS
#define ST_SCOPE static

#define ST_SORT qsort_tuple_signed
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE(a, b, state) qsort_tuple_signed_compare(a, b, state)
#define ST_COMPARE_ARG_TYPE Tuplesortstate
#define ST_CHECK_FOR_INTERRUPTS
#define ST_SCOPE static

#define ST_SORT qsort_tuple_int32
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE(a, b, state) qsort_tuple_int32_compare(a, b, state)
#define ST_COMPARE_ARG_TYPE Tuplesortstate
#define ST_CHECK_FOR_INTERRUPTS
#define ST_SCOPE static

#define ST_SORT qsort_tuple
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE_RUNTIME_POINTER
#define ST_COMPARE_ARG_TYPE Tuplesortstate
#define ST_CHECK_FOR_INTERRUPTS
#define ST_SCOPE static

#define ST_SORT qsort_ssup
#define ST_ELEMENT_TYPE SortTuple
#define ST_COMPARE(a, b, ssup) \
	ApplySortComparator((a)->datum1, (a)->isnull1, \
						(b)->datum1, (b)->isnull1, (ssup))
#define ST_COMPARE_ARG_TYPE SortSupportData
#define ST_CHECK_FOR_INTERRUPTS
#define ST_SCOPE static
/*
 * tuplesort_begin_xxx
 *
 * Initialize for a tuple sort operation.
 *
 * After calling tuplesort_begin, the caller should call tuplesort_putXXX
 * zero or more times, then call tuplesort_performsort when all the tuples
 * have been supplied.  After performsort, retrieve the tuples in sorted
 * order by calling tuplesort_getXXX until it returns false/NULL.  (If random
 * access was requested, rescan, markpos, and restorepos can also be called.)
 * Call tuplesort_end to terminate the operation and release memory/disk space.
 *
 * Each variant of tuplesort_begin has a workMem parameter specifying the
 * maximum number of kilobytes of RAM to use before spilling data to disk.
 * (The normal value of this parameter is work_mem, but some callers use
 * other values.)  Each variant also has a sortopt which is a bitmask of
 * sort options.  See TUPLESORT_* definitions in tuplesort.h
 */
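/*
 * Hedged sketch of the calling protocol described above, using the heap-tuple
 * variants from tuplesortvariants.c.  The tuple descriptor, sort keys, slot,
 * and the input loop are assumed to be provided by the (hypothetical) caller.
 */
static void
tuplesort_usage_sketch(TupleDesc tupDesc, int nkeys, AttrNumber *attNums,
					   Oid *sortOperators, Oid *sortCollations,
					   bool *nullsFirst, TupleTableSlot *slot)
{
	Tuplesortstate *sortstate;

	sortstate = tuplesort_begin_heap(tupDesc, nkeys, attNums,
									 sortOperators, sortCollations, nullsFirst,
									 work_mem, NULL, TUPLESORT_NONE);

	/* hypothetical input loop: feed every input tuple to the sort */
	tuplesort_puttupleslot(sortstate, slot);

	tuplesort_performsort(sortstate);

	/* drain the sorted output */
	while (tuplesort_gettupleslot(sortstate, true, false, slot, NULL))
	{
		/* consume 'slot' in sorted order */
	}

	tuplesort_end(sortstate);
}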
645 /* See leader_takeover_tapes() remarks on random access support */
		elog(ERROR, "random access disallowed under parallel sort");
650 * Memory context surviving tuplesort_reset. This memory context holds
651 * data which is useful to keep while sorting multiple similar batches.
658 * Create a working memory context for one sort operation. The content of
659 * this context is deleted by tuplesort_reset.
	 * Additionally a working memory context for tuples is set up in
	 * tuplesort_begin_batch.
671 * Make the Tuplesortstate within the per-sortstate context. This way, we
672 * don't need a separate pfree() operation for it at shutdown.
	state->base.sortopt = sortopt;
	state->base.tuples = true;
	state->abbrevNext = 10;
686 * workMem is forced to be at least 64KB, the current minimum valid value
687 * for the work_mem GUC. This is a defense against parallel sort callers
688 * that divide out memory among many workers in a way that leaves each
689 * with very little memory.
692 state->base.sortcontext = sortcontext;
693 state->base.maincontext = maincontext;
696 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
697 * see comments in grow_memtuples().
700 state->memtuples = NULL;
	 * After all of the other non-parallel-related state, we set up all of the
	 * state needed for each batch.
709 * Initialize parallel-related state based on coordination information
715 state->shared = NULL;
717 state->nParticipants = -1;
721 /* Parallel worker produces exactly one final run from all input */
724 state->nParticipants = -1;
728 /* Parallel leader state only used for final merge */
/*
 * tuplesort_begin_batch
 *
 * Set up, or reset, all state needed for processing a new set of tuples with
 * this sort state.  Called both from tuplesort_begin_common (the first time
 * sorting with this sort state) and tuplesort_reset (for subsequent usages).
 */

	/*
	 * Caller tuple (e.g. IndexTuple) memory context.
	 *
	 * A dedicated child context used exclusively for caller passed tuples
	 * eases memory management.  Resetting at key points reduces
	 * fragmentation.  Note that the memtuples array of SortTuples is allocated
	 * in the parent context, not this context, because there is no need to
	 * free memtuples early.  For bounded sorts, tuples may be pfreed in any
	 * order, so we use a regular aset.c context so that it can make use of
	 * free'd memory.  When the sort is not bounded, we make use of a bump.c
	 * context as this keeps allocations more compact with less wastage.
	 * Allocations are also slightly more CPU efficient.
	 */
	state->bounded = false;
	state->boundUsed = false;

	state->tapeset = NULL;

	state->memtupcount = 0;

	/*
	 * Initial size of array must be more than ALLOCSET_SEPARATE_THRESHOLD;
	 * see comments in grow_memtuples().
	 */
	state->growmemtuples = true;
	state->slabAllocatorUsed = false;

	state->memtuples = NULL;
	if (state->memtuples == NULL)
805 /* workMem must be large enough for the minimal memtuples array */
		elog(ERROR, "insufficient memory allowed for sort");
809 state->currentRun = 0;
812 * Tape variables (inputTapes, outputTapes, etc.) will be initialized by
813 * inittapes(), if needed.
816 state->result_tape = NULL;
/* flag that result tape has not been formed */
822 * tuplesort_set_bound
824 * Advise tuplesort that at most the first N result tuples are required.
826 * Must be called before inserting any tuples. (Actually, we could allow it
827 * as long as the sort hasn't spilled to disk, but there seems no need for
828 * delayed calls at the moment.)
830 * This is a hint only. The tuplesort may still return more tuples than
831 * requested. Parallel leader tuplesorts will always ignore the hint.
836 /* Assert we're called before loading any tuples */
838 /* Assert we allow bounded sorts */
840 /* Can't set the bound twice, either */
842 /* Also, this shouldn't be called in a parallel worker */
845 /* Parallel leader allows but ignores hint */
849#ifdef DEBUG_BOUNDED_SORT
850 /* Honor GUC setting that disables the feature (for easy testing) */
851 if (!optimize_bounded_sort)
	/* We want to be able to compute bound * 2, so limit the setting */
	if (bound > (int64) (INT_MAX / 2))
		return;

	state->bounded = true;
	state->bound = (int) bound;
863 * Bounded sorts are not an effective target for abbreviated key
864 * optimization. Disable by setting state to be consistent with no
865 * abbreviation support.
	state->base.sortKeys->abbrev_converter = NULL;
	if (state->base.sortKeys->abbrev_full_comparator)
		state->base.sortKeys->comparator = state->base.sortKeys->abbrev_full_comparator;

	/* Not strictly necessary, but be tidy */
	state->base.sortKeys->abbrev_abort = NULL;
	state->base.sortKeys->abbrev_full_comparator = NULL;
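/*
 * Hypothetical caller-side example of the bound hint described above: a
 * "LIMIT 100" query would advise the sort right after creating it, before
 * inserting any tuples.
 */
static void
bounded_sort_hint_sketch(Tuplesortstate *sortstate)
{
	tuplesort_set_bound(sortstate, 100);
}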
877 * tuplesort_used_bound
879 * Allow callers to find out if the sort state was able to use a bound.
884 return state->boundUsed;
890 * Internal routine for freeing resources of tuplesort.
895 /* context swap probably not needed, but let's be safe */
	spaceUsed = (state->allowedMem - state->availMem + 1023) / 1024;
905 * Delete temporary "tape" files, if any.
907 * We don't bother to destroy the individual tapes here. They will go away
908 * with the sortcontext. (In TSS_FINALMERGE state, we have closed
909 * finished tapes already.)
917 elog(
LOG,
"%s of worker %d ended, %" PRId64
" disk blocks used: %s",
918 SERIAL(
state) ?
"external sort" :
"parallel external sort",
921 elog(
LOG,
"%s of worker %d ended, %" PRId64
" KB used: %s",
922 SERIAL(
state) ?
"internal sort" :
"unperformed parallel sort",
926 TRACE_POSTGRESQL_SORT_DONE(
state->tapeset != NULL, spaceUsed);
932 * Free the per-sort memory context, thereby releasing all working memory.
940 * Release resources and clean up.
942 * NOTE: after calling this, any pointers returned by tuplesort_getXXX are
943 * pointing to garbage. Be careful not to attempt to use or free such
944 * pointers afterwards!
952 * Free the main memory context, including the Tuplesortstate struct
959 * tuplesort_updatemax
961 * Update maximum resource usage statistics.
970 * Note: it might seem we should provide both memory and disk usage for a
971 * disk-based sort. However, the current code doesn't track memory space
972 * accurately once we have begun to return tuples to the caller (since we
973 * don't account for pfree's the caller is expected to do), so we cannot
974 * rely on availMem in a disk sort. This does not seem worth the overhead
975 * to fix. Is it worth creating an API for the memory context code to
976 * tell us how much is actually used in sortcontext?
		spaceUsed = state->allowedMem - state->availMem;
	/*
	 * Sort evicts data to the disk when it wasn't able to fit that data into
	 * main memory.  This is why we assume space used on the disk to be more
	 * important for tracking resource usage than space used in memory.  Note
	 * that the amount of space occupied by some tupleset on the disk might be
	 * less than the amount of space occupied by the same tupleset in memory
	 * due to a more compact representation.
	 */
	if ((isSpaceDisk && !state->isMaxSpaceDisk) ||
		(isSpaceDisk == state->isMaxSpaceDisk && spaceUsed > state->maxSpace))
	{
		state->maxSpace = spaceUsed;
		state->isMaxSpaceDisk = isSpaceDisk;
	}
 * Reset the tuplesort.  Reset all the data in the tuplesort, but leave the
 * meta-information in.  After tuplesort_reset, tuplesort is ready to start
 * a new sort.  This allows avoiding recreation of tuple sort states (and
 * saving resources) when sorting multiple small batches.
1021 * After we've freed up per-batch memory, re-setup all of the state common
1022 * to both the first batch and any subsequent batch.
1026 state->lastReturnedTuple = NULL;
1027 state->slabMemoryBegin = NULL;
1028 state->slabMemoryEnd = NULL;
1029 state->slabFreeHead = NULL;
1033 * Grow the memtuples[] array, if possible within our memory constraint. We
1034 * must not exceed INT_MAX tuples in memory or the caller-provided memory
1035 * limit. Return true if we were able to enlarge the array, false if not.
1037 * Normally, at each increment we double the size of the array. When doing
1038 * that would exceed a limit, we attempt one last, smaller increase (and then
1039 * clear the growmemtuples flag so we don't try any more). That allows us to
1040 * use memory as fully as permitted; sticking to the pure doubling rule could
1041 * result in almost half going unused. Because availMem moves around with
1042 * tuple addition/removal, we need some rule to prevent making repeated small
1043 * increases in memtupsize, which would just be useless thrashing. The
1044 * growmemtuples flag accomplishes that and also prevents useless
1045 * recalculations in this function.
1051 int memtupsize =
state->memtupsize;
1054 /* Forget it if we've already maxed out memtuples, per comment above */
1055 if (!
state->growmemtuples)
1058 /* Select new value of memtupsize */
1059 if (memNowUsed <= state->availMem)
1062 * We've used no more than half of allowedMem; double our usage,
1063 * clamping at INT_MAX tuples.
1065 if (memtupsize < INT_MAX / 2)
1066 newmemtupsize = memtupsize * 2;
1069 newmemtupsize = INT_MAX;
1070 state->growmemtuples =
false;
1076 * This will be the last increment of memtupsize. Abandon doubling
1077 * strategy and instead increase as much as we safely can.
1079 * To stay within allowedMem, we can't increase memtupsize by more
1080 * than availMem / sizeof(SortTuple) elements. In practice, we want
1081 * to increase it by considerably less, because we need to leave some
1082 * space for the tuples to which the new array slots will refer. We
1083 * assume the new tuples will be about the same size as the tuples
1084 * we've already seen, and thus we can extrapolate from the space
1085 * consumption so far to estimate an appropriate new size for the
1086 * memtuples array. The optimal value might be higher or lower than
1087 * this estimate, but it's hard to know that in advance. We again
1088 * clamp at INT_MAX tuples.
1090 * This calculation is safe against enlarging the array so much that
1091 * LACKMEM becomes true, because the memory currently used includes
1092 * the present array; thus, there would be enough allowedMem for the
1093 * new array elements even if no other memory were currently used.
1095 * We do the arithmetic in float8, because otherwise the product of
1096 * memtupsize and allowedMem could overflow. Any inaccuracy in the
1097 * result should be insignificant; but even if we computed a
1098 * completely insane result, the checks below will prevent anything
1099 * really bad from happening.
1103 grow_ratio = (double)
state->allowedMem / (
double) memNowUsed;
1104 if (memtupsize * grow_ratio < INT_MAX)
1105 newmemtupsize = (int) (memtupsize * grow_ratio);
1107 newmemtupsize = INT_MAX;
1109 /* We won't make any further enlargement attempts */
1110 state->growmemtuples =
false;
1113 /* Must enlarge array by at least one element, else report failure */
1114 if (newmemtupsize <= memtupsize)
1118 * On a 32-bit machine, allowedMem could exceed MaxAllocHugeSize. Clamp
1119 * to ensure our request won't be rejected. Note that we can easily
1120 * exhaust address space before facing this outcome. (This is presently
1121 * impossible due to guc.c's MAX_KILOBYTES limitation on work_mem, but
1122 * don't rely on that at this distance.)
1127 state->growmemtuples =
false;
/* can't grow any more */
1131 * We need to be sure that we do not cause LACKMEM to become true, else
1132 * the space management algorithm will go nuts. The code above should
1133 * never generate a dangerous request, but to be safe, check explicitly
1134 * that the array growth fits within availMem. (We could still cause
1135 * LACKMEM if the memory chunk overhead associated with the memtuples
1136 * array were to increase. That shouldn't happen because we chose the
1137 * initial array size large enough to ensure that palloc will be treating
1138 * both old and new arrays as separate chunks. But we'll check LACKMEM
1139 * explicitly below just in case.)
1146 state->memtupsize = newmemtupsize;
1152 elog(
ERROR,
"unexpected out-of-memory situation in tuplesort");
1156 /* If for any reason we didn't realloc, shut off future attempts */
1157 state->growmemtuples =
false;
1162 * Shared code for tuple and datum cases.
1166 bool useAbbrev,
Size tuplen)
1172 /* account for the memory used for this tuple */
1174 state->tupleMem += tuplen;
1179 * Leave ordinary Datum representation, or NULL value. If there is a
1180 * converter it won't expect NULL values, and cost model is not
1181 * required to account for NULL, so in that case we avoid calling
1182 * converter and just set datum1 to zeroed representation (to be
1183 * consistent, and to support cheap inequality tests for NULL
1184 * abbreviated keys).
1189 /* Store abbreviated key representation */
1191 state->base.sortKeys);
1196 * Set state to be consistent with never trying abbreviation.
1198 * Alter datum1 representation in already-copied tuples, so as to
1199 * ensure a consistent representation (current tuple was just
1200 * handled). It does not matter if some dumped tuples are already
1201 * sorted on tape, since serialized tuples lack abbreviated keys
1202 * (TSS_BUILDRUNS state prevents control reaching here in any case).
1207 switch (
state->status)
1212 * Save the tuple into the unsorted array. First, grow the array
1213 * as needed. Note that we try to grow the array when there is
1214 * still one free slot remaining --- if we fail, there'll still be
1215 * room to store the incoming tuple, and then we'll switch to
1216 * tape-based operation.
1218 if (
state->memtupcount >=
state->memtupsize - 1)
1223 state->memtuples[
state->memtupcount++] = *tuple;
1226 * Check if it's time to switch over to a bounded heapsort. We do
1227 * so if the input tuple count exceeds twice the desired tuple
1228 * count (this is a heuristic for where heapsort becomes cheaper
1229 * than a quicksort), or if we've just filled workMem and have
1230 * enough tuples to meet the bound.
1232 * Note that once we enter TSS_BOUNDED state we will always try to
1233 * complete the sort that way. In the worst case, if later input
1234 * tuples are larger than earlier ones, this might cause us to
1235 * exceed workMem significantly.
1237 if (
state->bounded &&
1242 elog(
LOG,
"switching to bounded heapsort at %d tuples: %s",
1251 * Done if we still fit in available memory and have array slots.
1260 * Nope; time to switch to tape-based operation.
1273 * We don't want to grow the array here, so check whether the new
1274 * tuple can be discarded before putting it in. This should be a
1275 * good speed optimization, too, since when there are many more
1276 * input tuples than the bound, most input tuples can be discarded
1277 * with just this one comparison. Note that because we currently
1278 * have the sort direction reversed, we must check for <= not >=.
1282 /* new tuple <= top of the heap, so we can discard it */
1288 /* discard top of heap, replacing it with the new tuple */
1297 * Save the tuple into the unsorted array (there must be space)
1299 state->memtuples[
state->memtupcount++] = *tuple;
1302 * If we are over the memory limit, dump all tuples.
1317 Assert(
state->base.sortKeys[0].abbrev_converter != NULL);
1318 Assert(
state->base.sortKeys[0].abbrev_abort != NULL);
1319 Assert(
state->base.sortKeys[0].abbrev_full_comparator != NULL);
1322 * Check effectiveness of abbreviation optimization. Consider aborting
1323 * when still within memory limit.
1328 state->abbrevNext *= 2;
1331 * Check opclass-supplied abbreviation abort routine. It may indicate
1332 * that abbreviation should not proceed.
1334 if (!
state->base.sortKeys->abbrev_abort(
state->memtupcount,
1335 state->base.sortKeys))
1339 * Finally, restore authoritative comparator, and indicate that
1340 * abbreviation is not in play by setting abbrev_converter to NULL
1342 state->base.sortKeys[0].comparator =
state->base.sortKeys[0].abbrev_full_comparator;
1343 state->base.sortKeys[0].abbrev_converter = NULL;
1344 /* Not strictly necessary, but be tidy */
1345 state->base.sortKeys[0].abbrev_abort = NULL;
1346 state->base.sortKeys[0].abbrev_full_comparator = NULL;
1348 /* Give up - expect original pass-by-value representation */
1356 * All tuples have been provided; finish the sort.
1364 elog(
LOG,
"performsort of worker %d starting: %s",
1367 switch (
state->status)
			 * We were able to accumulate all the tuples within the allowed
			 * amount of memory, or the leader is about to take over the
			 * worker tapes.
1377 /* Just qsort 'em and we're done */
1384 * Parallel workers must still dump out tuples to tape. No
1385 * merge is required to produce single output run, though.
1395 * Leader will take over worker tapes and merge worker runs.
1396 * Note that mergeruns sets the correct state->status.
1402 state->eof_reached =
false;
1403 state->markpos_block = 0L;
1404 state->markpos_offset = 0;
1405 state->markpos_eof =
false;
1411 * We were able to accumulate all the tuples required for output
1412 * in memory, using a heap to eliminate excess tuples. Now we
1413 * have to transform the heap to a properly-sorted array. Note
1414 * that sort_bounded_heap sets the correct state->status.
1418 state->eof_reached =
false;
1419 state->markpos_offset = 0;
1420 state->markpos_eof =
false;
1426 * Finish tape-based sort. First, flush all tuples remaining in
1427 * memory out to tape; then merge until we have a single remaining
1428 * run (or, if !randomAccess and !WORKER(), one run per tape).
1429 * Note that mergeruns sets the correct state->status.
1433 state->eof_reached =
false;
1434 state->markpos_block = 0L;
1435 state->markpos_offset = 0;
1436 state->markpos_eof =
false;
1447 elog(
LOG,
"performsort of worker %d done (except %d-way final merge): %s",
1451 elog(
LOG,
"performsort of worker %d done: %s",
1459 * Internal routine to fetch the next tuple in either forward or back
1460 * direction into *stup. Returns false if no more tuples.
1461 * Returned tuple belongs to tuplesort memory context, and must not be freed
1462 * by caller. Note that fetched tuple is stored in memory that may be
1463 * recycled by any future fetch.
1469 unsigned int tuplen;
1474 switch (
state->status)
1486 state->eof_reached =
true;
1489 * Complain if caller tries to retrieve more tuples than
1490 * originally asked for in a bounded sort. This is because
1491 * returning EOF here might be the wrong thing.
1494 elog(
ERROR,
"retrieved too many tuples in a bounded sort");
1500 if (
state->current <= 0)
				 * If all tuples are fetched already then we return the last
				 * tuple, otherwise the tuple before the last one returned.
1507 if (
state->eof_reached)
1508 state->eof_reached =
false;
1511 state->current--;
/* last returned tuple */
1512 if (
state->current <= 0)
1515 *stup =
state->memtuples[
state->current - 1];
1525 * The slot that held the tuple that we returned in previous
1526 * gettuple call can now be reused.
1528 if (
state->lastReturnedTuple)
1531 state->lastReturnedTuple = NULL;
1536 if (
state->eof_reached)
1539 if ((tuplen =
getlen(
state->result_tape,
true)) != 0)
1544 * Remember the tuple we return, so that we can recycle
1545 * its memory on next call. (This can be NULL, in the
1546 * !state->tuples case).
1554 state->eof_reached =
true;
			 * If all tuples are fetched already then we return the last
			 * tuple, otherwise the tuple before the last one returned.
1565 if (
state->eof_reached)
1568 * Seek position is pointing just past the zero tuplen at the
1569 * end of file; back up to fetch last tuple's ending length
1570 * word. If seek fails we must have a completely empty file.
1573 2 *
sizeof(
unsigned int));
1576 else if (nmoved != 2 *
sizeof(
unsigned int))
1577 elog(
ERROR,
"unexpected tape position");
1578 state->eof_reached =
false;
1583 * Back up and fetch previously-returned tuple's ending length
1584 * word. If seek fails, assume we are at start of file.
1587 sizeof(
unsigned int));
1590 else if (nmoved !=
sizeof(
unsigned int))
1591 elog(
ERROR,
"unexpected tape position");
1595 * Back up to get ending length word of tuple before it.
1598 tuplen + 2 *
sizeof(
unsigned int));
1599 if (nmoved == tuplen +
sizeof(
unsigned int))
1602 * We backed up over the previous tuple, but there was no
1603 * ending length word before it. That means that the prev
1604 * tuple is the first tuple in the file. It is now the
1605 * next to read in forward direction (not obviously right,
1606 * but that is what in-memory case does).
1610 else if (nmoved != tuplen + 2 *
sizeof(
unsigned int))
1611 elog(
ERROR,
"bogus tuple length in backward scan");
1617 * Now we have the length of the prior tuple, back up and read it.
1618 * Note: READTUP expects we are positioned after the initial
1619 * length word of the tuple, so back up to that point.
1623 if (nmoved != tuplen)
1624 elog(
ERROR,
"bogus tuple length in backward scan");
1628 * Remember the tuple we return, so that we can recycle its memory
1629 * on next call. (This can be NULL, in the Datum case).
1637 /* We are managing memory ourselves, with the slab allocator. */
1641 * The slab slot holding the tuple that we returned in previous
1642 * gettuple call can now be reused.
1644 if (
state->lastReturnedTuple)
1647 state->lastReturnedTuple = NULL;
1651 * This code should match the inner loop of mergeonerun().
1653 if (
state->memtupcount > 0)
1655 int srcTapeIndex =
state->memtuples[0].srctape;
1659 *stup =
state->memtuples[0];
1662 * Remember the tuple we return, so that we can recycle its
1663 * memory on next call. (This can be NULL, in the Datum case).
1668 * Pull next tuple from tape, and replace the returned tuple
1669 * at top of the heap with it.
1674 * If no more data, we've reached end of run on this tape.
1675 * Remove the top node from the heap.
1678 state->nInputRuns--;
1681 * Close the tape. It'd go away at the end of the sort
1682 * anyway, but better to release the memory early.
1687 newtup.
srctape = srcTapeIndex;
1695 return false;
/* keep compiler quiet */
1701 * Advance over N tuples in either forward or back direction,
1702 * without returning any data. N==0 is a no-op.
1703 * Returns true if successful, false if ran out of tuples.
1711 * We don't actually support backwards skip yet, because no callers need
1712 * it. The API is designed to allow for that later, though.
1718 switch (
state->status)
1721 if (
state->memtupcount -
state->current >= ntuples)
1723 state->current += ntuples;
1727 state->eof_reached =
true;
1730 * Complain if caller tries to retrieve more tuples than
1731 * originally asked for in a bounded sort. This is because
1732 * returning EOF here might be the wrong thing.
1735 elog(
ERROR,
"retrieved too many tuples in a bounded sort");
1743 * We could probably optimize these cases better, but for now it's
1744 * not worth the trouble.
1747 while (ntuples-- > 0)
1763 return false;
/* keep compiler quiet */
/*
 * tuplesort_merge_order - report merge order we'll use for given memory
 * (note: "merge order" just means the number of input tapes in the merge).
 *
 * This is exported for use by the planner.  allowedMem is in bytes.
 */

	/*
	 * In the merge phase, we need buffer space for each input and output tape.
	 * Each pass in the balanced merge algorithm reads from M input tapes, and
	 * writes to N output tapes.  Each tape consumes TAPE_BUFFER_OVERHEAD bytes
	 * of memory.  In addition to that, we want MERGE_BUFFER_SIZE workspace per
	 * input tape.
	 *
	 * totalMem = M * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE) +
	 *			  N * TAPE_BUFFER_OVERHEAD
	 *
	 * Except for the last and next-to-last merge passes, where there can be
	 * fewer tapes left to process, M = N.  We choose M so that we have the
	 * desired amount of memory available for the input buffers
	 * (TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE), given the total memory
	 * available for the tape buffers (allowedMem).
	 *
	 * Note: you might be thinking we need to account for the memtuples[]
	 * array in this calculation, but we effectively treat that as part of the
	 * MERGE_BUFFER_SIZE workspace.
	 */
	mOrder = allowedMem / (2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE);

	/*
	 * Even in minimum memory, use at least a MINORDER merge.  On the other
	 * hand, even when we have lots of memory, do not use more than a MAXORDER
	 * merge.  Tapes are pretty cheap, but they're not entirely free.  Each
	 * additional tape reduces the amount of memory available to build runs,
	 * which in turn can cause the same sort to need more runs, which makes
	 * merging slower even if it can still be done in a single pass.  Also,
	 * high order merges are quite slow due to CPU cache effects; it can be
	 * faster to pay the I/O cost of a multi-pass merge than to perform a
	 * single merge pass across many hundreds of tapes.
	 */
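/*
 * Hedged sketch of the whole computation described above: derive the merge
 * order from the per-tape memory cost, then clamp it to [MINORDER, MAXORDER].
 * The real function's remaining statements are not shown here.
 */
static int
merge_order_sketch(int64 allowedMem)
{
	int			mOrder = (int) (allowedMem /
								(2 * TAPE_BUFFER_OVERHEAD + MERGE_BUFFER_SIZE));

	mOrder = Max(mOrder, MINORDER);
	mOrder = Min(mOrder, MAXORDER);
	return mOrder;
}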
1820 * Helper function to calculate how much memory to allocate for the read buffer
1821 * of each input tape in a merge pass.
1823 * 'avail_mem' is the amount of memory available for the buffers of all the
1824 * tapes, both input and output.
1825 * 'nInputTapes' and 'nInputRuns' are the number of input tapes and runs.
1826 * 'maxOutputTapes' is the max. number of output tapes we should produce.
	/*
	 * How many output tapes will we produce in this pass?
	 *
	 * This is nInputRuns / nInputTapes, rounded up.
	 */
	nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;

	nOutputTapes = Min(nOutputRuns, maxOutputTapes);
	/*
	 * Each output tape consumes TAPE_BUFFER_OVERHEAD bytes of memory.  All
	 * remaining memory is divided evenly between the input tapes.
	 *
	 * This also follows from the formula in tuplesort_merge_order, but here
	 * we derive the input buffer size from the amount of memory available,
	 * and the number of input and output tapes in this pass.
	 */
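/*
 * Hedged sketch of the division described above: each output tape reserves
 * TAPE_BUFFER_OVERHEAD bytes, and whatever remains is split evenly among the
 * nInputTapes input tapes.  The real helper differs in detail.
 */
static int64
merge_read_buffer_size_sketch(int64 avail_mem, int nInputTapes, int nInputRuns,
							  int maxOutputTapes)
{
	int			nOutputRuns = (nInputRuns + nInputTapes - 1) / nInputTapes;
	int			nOutputTapes = Min(nOutputRuns, maxOutputTapes);

	return (avail_mem - (int64) nOutputTapes * TAPE_BUFFER_OVERHEAD) / nInputTapes;
}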
1856 * inittapes - initialize for tape sorting.
1858 * This is called only if we have found we won't sort in memory.
1867 /* Compute number of input tapes to use when merging */
	/* Workers can sometimes produce a single run, output without merge */
1878 elog(
LOG,
"worker %d switching to external sort with %d tapes: %s",
1881 /* Create the tape set */
1885 state->shared ? &
state->shared->fileset : NULL,
1888 state->currentRun = 0;
1891 * Initialize logical tape arrays.
1893 state->inputTapes = NULL;
1894 state->nInputTapes = 0;
1895 state->nInputRuns = 0;
1898 state->nOutputTapes = 0;
1899 state->nOutputRuns = 0;
1907 * inittapestate - initialize generic tape management state
1915 * Decrease availMem to reflect the space needed for tape buffers; but
1916 * don't decrease it to the point that we have no room for tuples. (That
1917 * case is only likely to occur if sorting pass-by-value Datums; in all
1918 * other scenarios the memtuples[] array is unlikely to occupy more than
1919 * half of allowedMem. In the pass-by-value case it's not important to
1920 * account for tuple space, so we don't care if LACKMEM becomes
1929 * Make sure that the temp file(s) underlying the tape set are created in
1930 * suitable temp tablespaces. For parallel sorts, this should have been
1931 * called already, but it doesn't matter if it is called a second time.
1937 * selectnewtape -- select next tape to output to.
1939 * This is called after finishing a run when we know another run
1940 * must be started. This is used both when building the initial
1941 * runs, and during merge passes.
1947 * At the beginning of each merge pass, nOutputTapes and nOutputRuns are
1948 * both zero. On each call, we create a new output tape to hold the next
1949 * run, until maxTapes is reached. After that, we assign new runs to the
1950 * existing tapes in a round robin fashion.
1954 /* Create a new tape to hold the next run */
1959 state->nOutputTapes++;
1960 state->nOutputRuns++;
		/*
		 * We have reached the max number of tapes.  Append to an existing
		 * tape.
		 */
		state->nOutputRuns++;
1974 * Initialize the slab allocation arena, for the given number of slots.
1985 state->slabMemoryEnd =
state->slabMemoryBegin +
1990 p =
state->slabMemoryBegin;
1991 for (
i = 0;
i < numSlots - 1;
i++)
2000 state->slabMemoryBegin =
state->slabMemoryEnd = NULL;
2001 state->slabFreeHead = NULL;
2003 state->slabAllocatorUsed =
true;
2007 * mergeruns -- merge all the completed initial runs.
2009 * This implements the Balanced k-Way Merge Algorithm. All input data has
2010 * already been written to initial runs on tape (see dumptuples).
2020 if (
state->base.sortKeys != NULL &&
state->base.sortKeys->abbrev_converter != NULL)
2023 * If there are multiple runs to be merged, when we go to read back
2024 * tuples from disk, abbreviated keys will not have been stored, and
2025 * we don't care to regenerate them. Disable abbreviation from this
2028 state->base.sortKeys->abbrev_converter = NULL;
2029 state->base.sortKeys->comparator =
state->base.sortKeys->abbrev_full_comparator;
2031 /* Not strictly necessary, but be tidy */
2032 state->base.sortKeys->abbrev_abort = NULL;
2033 state->base.sortKeys->abbrev_full_comparator = NULL;
2037 * Reset tuple memory. We've freed all the tuples that we previously
2038 * allocated. We will use the slab allocator from now on.
2043 * We no longer need a large memtuples array. (We will allocate a smaller
2044 * one for the heap later.)
2048 state->memtuples = NULL;
2051 * Initialize the slab allocator. We need one slab slot per input tape,
2052 * for the tuples in the heap, plus one to hold the tuple last returned
2053 * from tuplesort_gettuple. (If we're sorting pass-by-val Datums,
	 * however, we don't need to allocate anything.)
2056 * In a multi-pass merge, we could shrink this allocation for the last
2057 * merge pass, if it has fewer tapes than previous passes, but we don't
2060 * From this point on, we no longer use the USEMEM()/LACKMEM() mechanism
2061 * to track memory usage of individual tuples.
2063 if (
state->base.tuples)
2069 * Allocate a new 'memtuples' array, for the heap. It will hold one tuple
2070 * from each input tape.
2072 * We could shrink this, too, between passes in a multi-pass merge, but we
2073 * don't bother. (The initial input tapes are still in outputTapes. The
2074 * number of input tapes will not increase between passes.)
2082 * Use all the remaining memory we have available for tape buffers among
2083 * all the input tapes. At the beginning of each merge pass, we will
2084 * divide this memory between the input and output tapes in the pass.
2089 elog(
LOG,
"worker %d using %zu KB of memory for tape buffers",
2090 state->worker,
state->tape_buffer_mem / 1024);
2095 * On the first iteration, or if we have read all the runs from the
2096 * input tapes in a multi-pass merge, it's time to start a new pass.
2097 * Rewind all the output tapes, and make them inputs for the next
2100 if (
state->nInputRuns == 0)
2102 int64 input_buffer_size;
2104 /* Close the old, emptied, input tapes */
2105 if (
state->nInputTapes > 0)
2107 for (tapenum = 0; tapenum <
state->nInputTapes; tapenum++)
2112 /* Previous pass's outputs become next pass's inputs. */
2118 * Reset output tape variables. The actual LogicalTapes will be
2119 * created as needed, here we only allocate the array to hold
2123 state->nOutputTapes = 0;
2124 state->nOutputRuns = 0;
2127 * Redistribute the memory allocated for tape buffers, among the
2128 * new input and output tapes.
2136 elog(
LOG,
"starting merge pass of %d input runs on %d tapes, " INT64_FORMAT " KB of memory for each input tape: %s",
2137 state->nInputRuns,
state->nInputTapes, input_buffer_size / 1024,
2140 /* Prepare the new input tapes for merge pass. */
2141 for (tapenum = 0; tapenum <
state->nInputTapes; tapenum++)
		 * If there's just one run left on each input tape, then only one
		 * merge pass remains.  If we don't have to produce a materialized
		 * sorted tape, we can stop at this point and do the final merge
		 * on-the-fly.
		 */
2154 /* Tell logtape.c we won't be writing anymore */
2156 /* Initialize for the final merge pass */
2163 /* Select an output tape */
2166 /* Merge one run from each input tape. */
2170 * If the input tapes are empty, and we output only one output run,
2171 * we're done. The current output tape contains the final result.
2173 if (
state->nInputRuns == 0 &&
state->nOutputRuns <= 1)
2178 * Done. The result is on a single run on a single tape.
2187 /* Close all the now-empty input tapes, to release their read buffers. */
2188 for (tapenum = 0; tapenum <
state->nInputTapes; tapenum++)
2193 * Merge one run from each input tape.
2202 * Start the merge by loading one tuple from each active source tape into
2210 * Execute merge by repeatedly extracting lowest tuple in heap, writing it
2211 * out, and replacing it with next tuple from same tape (if there is
2214 while (
state->memtupcount > 0)
2218 /* write the tuple to destTape */
2219 srcTapeIndex =
state->memtuples[0].srctape;
2220 srcTape =
state->inputTapes[srcTapeIndex];
2223 /* recycle the slot of the tuple we just wrote out, for the next read */
2224 if (
state->memtuples[0].tuple)
2228 * pull next tuple from the tape, and replace the written-out tuple in
2239 state->nInputRuns--;
2244 * When the heap empties, we're done. Write an end-of-run marker on the
2251 * beginmerge - initialize for a merge pass
2253 * Fill the merge heap with the first tuple from each input tape.
2261 /* Heap should be empty here */
2266 for (srcTapeIndex = 0; srcTapeIndex < activeTapes; srcTapeIndex++)
2279 * mergereadnext - read next tuple from one merge input tape
2281 * Returns false on EOF.
2286 unsigned int tuplen;
2288 /* read next tuple, if any */
2289 if ((tuplen =
getlen(srcTape,
true)) == 0)
2297 * dumptuples - remove tuples from memtuples and write initial run to tape
2299 * When alltuples = true, dump everything currently in memory. (This case is
2300 * only used at end of input data.)
2309 * Nothing to do if we still fit in available memory and have array slots,
2310 * unless this is the final call during initial run generation.
2317 * Final call might require no sorting, in rare cases where we just so
2318 * happen to have previously LACKMEM()'d at the point where exactly all
2319 * remaining tuples are loaded into memory, just before input was
2320 * exhausted. In general, short final runs are quite possible, but avoid
2321 * creating a completely empty run. In a worker, though, we must produce
2322 * at least one tape, even if it's empty.
2324 if (
state->memtupcount == 0 &&
state->currentRun > 0)
	 * It seems unlikely that this limit will ever be exceeded, but take no
	 * chances.
2333 if (
state->currentRun == INT_MAX)
2335 (
errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2336 errmsg(
"cannot have more than %d runs for an external sort",
2339 if (
state->currentRun > 0)
2342 state->currentRun++;
2345 elog(
LOG,
"worker %d starting quicksort of run %d: %s",
2350 * Sort all tuples accumulated within the allowed amount of memory for
2351 * this run using quicksort
2356 elog(
LOG,
"worker %d finished quicksort of run %d: %s",
2360 memtupwrite =
state->memtupcount;
2361 for (
i = 0;
i < memtupwrite;
i++)
2368 state->memtupcount = 0;
2371 * Reset tuple memory. We've freed all of the tuples that we previously
2372 * allocated. It's important to avoid fragmentation when there is a stark
2373 * change in the sizes of incoming tuples. In bounded sorts,
2374 * fragmentation due to AllocSetFree's bucketing by size class might be
2375 * particularly bad if this step wasn't taken.
2380 * Now update the memory accounting to subtract the memory used by the
2384 state->tupleMem = 0;
2389 elog(
LOG,
"worker %d finished writing run %d to tape %d: %s",
2395 * tuplesort_rescan - rewind and replay the scan
2404 switch (
state->status)
2408 state->eof_reached =
false;
2409 state->markpos_offset = 0;
2410 state->markpos_eof =
false;
2414 state->eof_reached =
false;
2415 state->markpos_block = 0L;
2416 state->markpos_offset = 0;
2417 state->markpos_eof =
false;
2428 * tuplesort_markpos - saves current position in the merged sort file
2437 switch (
state->status)
2445 &
state->markpos_block,
2446 &
state->markpos_offset);
2458 * tuplesort_restorepos - restores current position in merged sort file to
2459 * last saved position
2468 switch (
state->status)
2476 state->markpos_block,
2477 state->markpos_offset);
 * tuplesort_get_stats - extract summary statistics
 *
 * This can be called after tuplesort_performsort() finishes to obtain
 * printable summary information about how the sort was performed.
 */

	/*
	 * Note: it might seem we should provide both memory and disk usage for a
	 * disk-based sort. However, the current code doesn't track memory space
	 * accurately once we have begun to return tuples to the caller (since we
	 * don't account for pfree's the caller is expected to do), so we cannot
	 * rely on availMem in a disk sort. This does not seem worth the overhead
	 * to fix. Is it worth creating an API for the memory context code to
	 * tell us how much is actually used in sortcontext?
	 */
	if (state->isMaxSpaceDisk)
		stats->spaceType = SORT_SPACE_TYPE_DISK;
	else
		stats->spaceType = SORT_SPACE_TYPE_MEMORY;

	switch (state->maxSpaceStatus)
	{
		case TSS_SORTEDINMEM:
			stats->sortMethod = state->boundUsed ?
				SORT_TYPE_TOP_N_HEAPSORT : SORT_TYPE_QUICKSORT;
			break;
	}
 * Convert TuplesortMethod to a string.
 */

		case SORT_TYPE_STILL_IN_PROGRESS:
			return "still in progress";
		case SORT_TYPE_TOP_N_HEAPSORT:
			return "top-N heapsort";
		case SORT_TYPE_EXTERNAL_SORT:
			return "external sort";
		case SORT_TYPE_EXTERNAL_MERGE:
			return "external merge";
 * Heap manipulation routines, per Knuth's Algorithm 5.2.3H.
 */

/*
 * Convert the existing unordered array of SortTuples to a bounded heap,
 * discarding all but the smallest "state->bound" tuples.
 *
 * When working with a bounded heap, we want to keep the largest entry
 * at the root (array entry zero), instead of the smallest as in the normal
 * sort case. This allows us to discard the largest entry cheaply.
 * Therefore, we temporarily reverse the sort direction.
 */
	int			tupcount = state->memtupcount;

	/* Reverse sort direction so largest entry will be at root */
	reversedirection(state);

	state->memtupcount = 0;		/* make the heap empty */
	for (i = 0; i < tupcount; i++)
	{
		if (state->memtupcount < state->bound)
		{
			/* Insert next tuple into heap */
			/* Must copy source tuple to avoid possible overwrite */
			SortTuple	stup = state->memtuples[i];

			tuplesort_heap_insert(state, &stup);
		}
		else
		{
			/*
			 * The heap is full. Replace the largest entry with the new
			 * tuple, or just discard it, if it's larger than anything
			 * already in the heap.
			 */
		}
	}
 * Convert the bounded heap to a properly-sorted array
 */

	int			tupcount = state->memtupcount;

	/*
	 * We can unheapify in place because each delete-top call will remove
	 * the largest entry, which we can promptly store in the newly freed
	 * slot at the end. Once we're down to a single-entry heap, we're done.
	 */
	while (state->memtupcount > 1)
	{
		SortTuple	stup = state->memtuples[0];

		/* this sifts-up the next-largest entry and decreases memtupcount */
		tuplesort_heap_delete_top(state);
		state->memtuples[state->memtupcount] = stup;
	}
	state->memtupcount = tupcount;

	/*
	 * Reverse sort direction back to the original state. This is not
	 * actually necessary but seems like a good idea for tidiness.
	 */
	reversedirection(state);

	state->boundUsed = true;
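/*
 * Illustrative sketch (not part of tuplesort.c): bounded top-N selection in
 * the style described above.  A max-heap of size "bound" keeps the largest
 * retained value at the root, so an incoming value that is not smaller can
 * be discarded immediately; at the end the heap is unheapified in place
 * into ascending order.  max_sift_down and top_n_smallest are hypothetical
 * names, and the elements are plain integers rather than SortTuples.
 */
#include <stddef.h>

static void
max_sift_down(int *heap, size_t n, size_t i)
{
	for (;;)
	{
		size_t		j = 2 * i + 1;

		if (j >= n)
			break;
		if (j + 1 < n && heap[j + 1] > heap[j])
			j++;				/* pick the larger child */
		if (heap[i] >= heap[j])
			break;
		int			tmp = heap[i];

		heap[i] = heap[j];
		heap[j] = tmp;
		i = j;
	}
}

/* Keep the "bound" smallest of vals[0..n-1] in heap[], sorted ascending. */
static size_t
top_n_smallest(const int *vals, size_t n, int *heap, size_t bound)
{
	size_t		count = 0;

	for (size_t i = 0; i < n; i++)
	{
		if (count < bound)
		{
			/* still filling: sift the new value up toward the root */
			size_t		j = count++;

			heap[j] = vals[i];
			while (j > 0 && heap[(j - 1) / 2] < heap[j])
			{
				int			tmp = heap[j];

				heap[j] = heap[(j - 1) / 2];
				heap[(j - 1) / 2] = tmp;
				j = (j - 1) / 2;
			}
		}
		else if (vals[i] < heap[0])
		{
			/* smaller than the current largest: replace the root */
			heap[0] = vals[i];
			max_sift_down(heap, count, 0);
		}
	}

	/* unheapify in place: move each successive maximum to the freed slot */
	for (size_t m = count; m > 1; m--)
	{
		int			top = heap[0];

		heap[0] = heap[m - 1];
		max_sift_down(heap, m - 1, 0);
		heap[m - 1] = top;
	}
	return count;
}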
 * Sort all memtuples using specialized qsort() routines.
 *
 * Quicksort is used for small in-memory sorts, and external sort runs.
 */

	if (state->memtupcount > 1)
	{
		/*
		 * Do we have the leading column's value or abbreviation in datum1,
		 * and is there a specialization for its comparator?
		 */
		if (state->base.haveDatum1 && state->base.sortKeys)
		{
			if (state->base.sortKeys[0].comparator == ssup_datum_unsigned_cmp)
			{
				qsort_tuple_unsigned(state->memtuples,
									 state->memtupcount, state);
				return;
			}
			else if (state->base.sortKeys[0].comparator == ssup_datum_signed_cmp)
			{
				qsort_tuple_signed(state->memtuples,
								   state->memtupcount, state);
				return;
			}
			else if (state->base.sortKeys[0].comparator == ssup_datum_int32_cmp)
			{
				qsort_tuple_int32(state->memtuples,
								  state->memtupcount, state);
				return;
			}
		}

		/* Can we use the single-key sort function? */
		if (state->base.onlyKey != NULL)
			qsort_ssup(state->memtuples, state->memtupcount,
					   state->base.onlyKey);
		else
			qsort_tuple(state->memtuples, state->memtupcount,
						state->base.comparetup, state);
	}
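/*
 * Illustrative sketch (not part of tuplesort.c): choosing a cheaper
 * comparison when the leading key alone decides the order, and falling back
 * to a full comparison otherwise, analogous to the onlyKey/comparetup split
 * above.  Row, cmp_leading_only, cmp_full and sort_rows are hypothetical;
 * the real code dispatches to specialized qsort variants instead.
 */
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

typedef struct Row
{
	int			key1;			/* leading key */
	const char *key2;			/* tiebreaker, only consulted by cmp_full */
} Row;

static int
cmp_leading_only(const void *a, const void *b)
{
	const Row  *x = a;
	const Row  *y = b;

	return (x->key1 > y->key1) - (x->key1 < y->key1);
}

static int
cmp_full(const void *a, const void *b)
{
	int			c = cmp_leading_only(a, b);

	if (c != 0)
		return c;
	return strcmp(((const Row *) a)->key2, ((const Row *) b)->key2);
}

static void
sort_rows(Row *rows, size_t nrows, bool leading_key_decides)
{
	if (nrows <= 1)
		return;					/* nothing to sort */
	qsort(rows, nrows, sizeof(Row),
		  leading_key_decides ? cmp_leading_only : cmp_full);
}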
 * Insert a new tuple into an empty or existing heap, maintaining the
 * heap invariant. Caller is responsible for ensuring there's room.
 *
 * Note: For some callers, tuple points to a memtuples[] entry above the
 * end of the heap. This is safe as long as it's not immediately adjacent
 * to the end of the heap (ie, in the [memtupcount] array entry) --- if it
 * is, it might get overwritten before being moved into the heap!
 */

	memtuples = state->memtuples;

	/*
	 * Sift-up the new entry, per Knuth 5.2.3 exercise 16. Note that Knuth is
	 * using 1-based array indexes, not 0-based.
	 */
	j = state->memtupcount++;
	while (j > 0)
	{
		int			i = (j - 1) >> 1;

		if (COMPARETUP(state, tuple, &memtuples[i]) >= 0)
			break;
		memtuples[j] = memtuples[i];
		j = i;
	}
	memtuples[j] = *tuple;
 * Remove the tuple at state->memtuples[0] from the heap. Decrement
 * memtupcount, and sift up to maintain the heap invariant.
 *
 * The caller has already free'd the tuple the top node points to, if
 * necessary.
 */

	if (--state->memtupcount <= 0)
		return;

	/*
	 * Remove the last tuple in the heap, and re-insert it, by replacing the
	 * current top node with it.
	 */
	tuple = &memtuples[state->memtupcount];
	tuplesort_heap_replace_top(state, tuple);
 * Replace the tuple at state->memtuples[0] with a new tuple. Sift up to
 * maintain the heap invariant.
 *
 * This corresponds to Knuth's "sift-up" algorithm (Algorithm 5.2.3H,
 * Heapsort, steps H3-H8).
 */

	/*
	 * state->memtupcount is "int", but we use "unsigned int" for i, j, n.
	 * This prevents overflow in the "2 * i + 1" calculation, since at the
	 * top of the loop we must have i < n <= INT_MAX <= UINT_MAX/2.
	 */
	n = state->memtupcount;
	i = 0;						/* i is where the "hole" is */
	for (;;)
	{
		unsigned int j = 2 * i + 1;

		if (j >= n)
			break;
		if (j + 1 < n &&
			COMPARETUP(state, &memtuples[j], &memtuples[j + 1]) > 0)
			j++;
		if (COMPARETUP(state, tuple, &memtuples[j]) <= 0)
			break;
		memtuples[i] = memtuples[j];
		i = j;
	}
	memtuples[i] = *tuple;
 * Function to reverse the sort direction from its current state
 *
 * It is not safe to call this when performing hash tuplesorts
 */

	for (nkey = 0; nkey < state->base.nKeys; nkey++, sortKey++)
 * Tape interface routines
 */

	if (len == 0 && !eofOK)
		elog(ERROR, "unexpected end of data");

	/* markrunend: a zero length word marks the end of a run */
	unsigned int len = 0;

/*
 * Get memory for tuple from within READTUP() routine.
 *
 * We use the next free slot from the slab allocator, or palloc() if the
 * tuple is too large for that.
 */

	/*
	 * We pre-allocate enough slots in the slab arena that we should never
	 * run out.
	 */

		/* Reuse this slot */
		state->slabFreeHead = buf->nextfree;
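/*
 * Illustrative sketch (not part of tuplesort.c): a fixed-size slab arena
 * with an intrusive free list and a malloc() fallback for oversized
 * requests, mirroring the "reuse this slot or palloc()" policy above.
 * SLOT_SIZE, SlabSlotSketch, SlabArena, slab_init, slab_alloc and
 * slab_free are all hypothetical names.
 */
#include <stdlib.h>

#define SLOT_SIZE 1024

typedef union SlabSlotSketch
{
	union SlabSlotSketch *nextfree;
	char		buffer[SLOT_SIZE];
} SlabSlotSketch;

typedef struct SlabArena
{
	SlabSlotSketch *slots;
	size_t		nslots;
	SlabSlotSketch *freehead;
} SlabArena;

static void
slab_init(SlabArena *arena, size_t nslots)
{
	arena->slots = malloc(nslots * sizeof(SlabSlotSketch));
	arena->nslots = (arena->slots != NULL) ? nslots : 0;
	arena->freehead = NULL;
	/* thread every slot onto the free list */
	for (size_t i = 0; i < arena->nslots; i++)
	{
		arena->slots[i].nextfree = arena->freehead;
		arena->freehead = &arena->slots[i];
	}
}

static void *
slab_alloc(SlabArena *arena, size_t size)
{
	if (size > SLOT_SIZE || arena->freehead == NULL)
		return malloc(size);	/* too large, or arena exhausted: fall back */

	SlabSlotSketch *slot = arena->freehead;

	arena->freehead = slot->nextfree;	/* reuse this slot */
	return slot;
}

static void
slab_free(SlabArena *arena, void *p)
{
	SlabSlotSketch *slot = (SlabSlotSketch *) p;

	/* slab slot: push back onto the free list; otherwise it was malloc'd */
	if (slot >= arena->slots && slot < arena->slots + arena->nslots)
	{
		slot->nextfree = arena->freehead;
		arena->freehead = slot;
	}
	else
		free(p);
}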
 * Parallel sort routines
 */

/*
 * tuplesort_estimate_shared - estimate required shared memory allocation
 *
 * nWorkers is an estimate of the number of workers (it's the number that
 * will be requested).
 */

	/* Make sure that BufFile shared state is MAXALIGN'd */

/*
 * tuplesort_initialize_shared - initialize shared tuplesort state
 *
 * Must be called from leader process before workers are launched, to
 * establish state needed up-front for worker tuplesortstates. nWorkers
 * should match the argument passed to tuplesort_estimate_shared().
 */

	shared->nTapes = nWorkers;
	for (i = 0; i < nWorkers; i++)
		shared->tapes[i].firstblocknumber = 0L;
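/*
 * Illustrative sketch (not part of tuplesort.c): sizing a shared-memory
 * area as an aligned fixed header followed by a per-worker array, with
 * overflow-checked arithmetic.  ALIGN_UP, checked_add, checked_mul,
 * SharedHeaderSketch and WorkerSlotSketch are hypothetical stand-ins for
 * MAXALIGN, add_size, mul_size and the real shared state.
 */
#include <stdint.h>
#include <stdlib.h>

#define ALIGN_UP(x, a)	(((x) + ((size_t) (a) - 1)) & ~((size_t) (a) - 1))

static size_t
checked_add(size_t a, size_t b)
{
	if (SIZE_MAX - a < b)
		abort();				/* overflow; real code reports an error */
	return a + b;
}

static size_t
checked_mul(size_t a, size_t b)
{
	if (b != 0 && a > SIZE_MAX / b)
		abort();
	return a * b;
}

typedef struct SharedHeaderSketch
{
	int			nworkers;
	int			workersFinished;
} SharedHeaderSketch;

typedef struct WorkerSlotSketch
{
	long		firstblocknumber;	/* per-worker result-tape metadata */
} WorkerSlotSketch;

/* shared-memory size needed for nworkers participants */
static size_t
estimate_shared_size(int nworkers)
{
	size_t		sz = ALIGN_UP(sizeof(SharedHeaderSketch), 8);

	return checked_add(sz, checked_mul((size_t) nworkers,
									   sizeof(WorkerSlotSketch)));
}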
 * tuplesort_attach_shared - attach to shared tuplesort state
 *
 * Must be called by all worker processes.
 */

	/* Attach to SharedFileSet */

/*
 * worker_get_identifier - Assign and return ordinal identifier for worker
 *
 * The order in which these are assigned is not well defined, and should not
 * matter; worker numbers across parallel sort participants need only be
 * distinct and gapless. logtape.c requires this.
 *
 * Note that the identifiers assigned from here have no relation to
 * ParallelWorkerNumber, to avoid making any assumption about the caller's
 * requirements. However, we do follow the ParallelWorkerNumber convention
 * of representing a non-worker with worker number -1. This includes the
 * leader, as well as serial Tuplesort processes.
 */
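/*
 * Illustrative sketch (not part of tuplesort.c): handing out distinct,
 * gapless ordinal identifiers from a shared counter, in the spirit of
 * worker_get_identifier() above.  A pthread mutex stands in for the
 * spinlock that protects the shared sort state; IdCounter and
 * next_worker_id are hypothetical names.
 */
#include <pthread.h>

typedef struct IdCounter
{
	pthread_mutex_t lock;
	int			next_id;		/* 0, 1, 2, ... in attach order */
} IdCounter;

/* e.g. static IdCounter counter = {PTHREAD_MUTEX_INITIALIZER, 0}; */
static int
next_worker_id(IdCounter *counter)
{
	int			id;

	pthread_mutex_lock(&counter->lock);
	id = counter->next_id++;
	pthread_mutex_unlock(&counter->lock);
	return id;					/* distinct and gapless, but order-agnostic */
}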
 * worker_freeze_result_tape - freeze worker's result tape for leader
 *
 * This is called by workers just after the result tape has been determined,
 * instead of calling LogicalTapeFreeze() directly. They do so because
 * workers require a few additional steps over similar serial
 * TSS_SORTEDONTAPE external sort cases, which also happen here. The extra
 * steps are around freeing now unneeded resources, and representing to
 * leader that worker's input run is available for its merge.
 *
 * There should only be one final output run for each worker, which consists
 * of all tuples that were originally input into worker.
 */

	/*
	 * Free most remaining memory, in case caller is sensitive to our holding
	 * on to it. memtuples may not be a tiny merge heap at this point.
	 */
	state->memtuples = NULL;
	state->memtupsize = 0;

	/*
	 * Parallel worker requires result tape metadata, which is to be stored
	 * in shared memory for leader
	 */

	/* Store properties of output tape, and update finished worker count */

/*
 * worker_nomergeruns - dump memtuples in worker, without merging
 *
 * This is called as an alternative to mergeruns() with a worker when no
 * merging is required.
 */
 * leader_takeover_tapes - create tapeset for leader from worker tapes
 *
 * So far, leader Tuplesortstate has performed no actual sorting. By now, all
 * sorting has occurred in workers, all of which must have already returned
 * from tuplesort_performsort().
 *
 * When this returns, leader process is left in a state that is virtually
 * indistinguishable from it having generated runs as a serial external sort
 * would have.
 */

	int			nParticipants = state->nParticipants;
	int			workersFinished;

	Assert(nParticipants >= 1);

	if (nParticipants != workersFinished)
		elog(ERROR, "cannot take over tapes before all workers finish");

	/*
	 * Create the tapeset from worker tapes, including a leader-owned tape at
	 * the end. Parallel workers are far more expensive than logical tapes,
	 * so the number of tapes allocated here should never be excessive.
	 */

	/*
	 * Set currentRun to reflect the number of runs we will merge (it's not
	 * used for anything, this is just pro forma)
	 */
	state->currentRun = nParticipants;

	/*
	 * Initialize the state to look the same as after building the initial
	 * runs.
	 *
	 * There will always be exactly 1 run per worker, and exactly one input
	 * tape per run, because workers always output exactly 1 run, even when
	 * there were no input tuples for workers to sort.
	 */
	state->inputTapes = NULL;
	state->nInputTapes = 0;
	state->nInputRuns = 0;

	state->nOutputTapes = nParticipants;
	state->nOutputRuns = nParticipants;

	for (j = 0; j < nParticipants; j++)
 * Convenience routine to free a tuple previously loaded into sort memory
 */