summaryrefslogtreecommitdiff
path: root/src/include
diff options
context:
space:
mode:
authorTomas Vondra <tomas.vondra@postgresql.org>2020-04-06 21:33:28 +0200
committerTomas Vondra <tomas.vondra@postgresql.org>2020-04-06 21:35:10 +0200
commitd2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da (patch)
tree66e2560c49ee43d13c29bd9f5760731001312738 /src/include
parent3c8553547b1493c4afdb80393f4a47dbfa019a79 (diff)
downloadpostgresql-d2d8a229bc58a2014dce1c7a4fcdb6c5ab9fb8da.tar.gz
Implement Incremental Sort
Incremental Sort is an optimized variant of multikey sort for cases when the input is already sorted by a prefix of the requested sort keys. For example when the relation is already sorted by (key1, key2) and we need to sort it by (key1, key2, key3) we can simply split the input rows into groups having equal values in (key1, key2), and only sort/compare the remaining column key3. This has a number of benefits: - Reduced memory consumption, because only a single group (determined by values in the sorted prefix) needs to be kept in memory. This may also eliminate the need to spill to disk. - Lower startup cost, because Incremental Sort produce results after each prefix group, which is beneficial for plans where startup cost matters (like for example queries with LIMIT clause). We consider both Sort and Incremental Sort, and decide based on costing. The implemented algorithm operates in two different modes: - Fetching a minimum number of tuples without check of equality on the prefix keys, and sorting on all columns when safe. - Fetching all tuples for a single prefix group and then sorting by comparing only the remaining (non-prefix) keys. We always start in the first mode, and employ a heuristic to switch into the second mode if we believe it's beneficial - the goal is to minimize the number of unnecessary comparions while keeping memory consumption below work_mem. This is a very old patch series. The idea was originally proposed by Alexander Korotkov back in 2013, and then revived in 2017. In 2018 the patch was taken over by James Coleman, who wrote and rewrote most of the current code. There were many reviewers/contributors since 2013 - I've done my best to pick the most active ones, and listed them in this commit message. Author: James Coleman, Alexander Korotkov Reviewed-by: Tomas Vondra, Andreas Karlsson, Marti Raudsepp, Peter Geoghegan, Robert Haas, Thomas Munro, Antonin Houska, Andres Freund, Alexander Kuzmenkov Discussion: https://postgr.es/m/CAPpHfdscOX5an71nHd8WSUH6GNOCf=V7wgDaTXdDd9=goN-gfA@mail.gmail.com Discussion: https://postgr.es/m/CAPpHfds1waRZ=NOmueYq0sx1ZSCnt+5QJvizT8ndT2=etZEeAQ@mail.gmail.com
Diffstat (limited to 'src/include')
-rw-r--r--src/include/executor/execdebug.h2
-rw-r--r--src/include/executor/nodeIncrementalSort.h28
-rw-r--r--src/include/nodes/execnodes.h80
-rw-r--r--src/include/nodes/nodes.h3
-rw-r--r--src/include/nodes/pathnodes.h9
-rw-r--r--src/include/nodes/plannodes.h10
-rw-r--r--src/include/optimizer/cost.h6
-rw-r--r--src/include/optimizer/pathnode.h6
-rw-r--r--src/include/optimizer/paths.h1
-rw-r--r--src/include/utils/tuplesort.h16
10 files changed, 156 insertions, 5 deletions
diff --git a/src/include/executor/execdebug.h b/src/include/executor/execdebug.h
index 2e9920111f..4af6e0013d 100644
--- a/src/include/executor/execdebug.h
+++ b/src/include/executor/execdebug.h
@@ -86,10 +86,12 @@
#define SO_nodeDisplay(l) nodeDisplay(l)
#define SO_printf(s) printf(s)
#define SO1_printf(s, p) printf(s, p)
+#define SO2_printf(s, p1, p2) printf(s, p1, p2)
#else
#define SO_nodeDisplay(l)
#define SO_printf(s)
#define SO1_printf(s, p)
+#define SO2_printf(s, p1, p2)
#endif /* EXEC_SORTDEBUG */
/* ----------------
diff --git a/src/include/executor/nodeIncrementalSort.h b/src/include/executor/nodeIncrementalSort.h
new file mode 100644
index 0000000000..e62c02a4f3
--- /dev/null
+++ b/src/include/executor/nodeIncrementalSort.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * nodeIncrementalSort.h
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/executor/nodeIncrementalSort.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef NODEINCREMENTALSORT_H
+#define NODEINCREMENTALSORT_H
+
+#include "access/parallel.h"
+#include "nodes/execnodes.h"
+
+extern IncrementalSortState *ExecInitIncrementalSort(IncrementalSort *node, EState *estate, int eflags);
+extern void ExecEndIncrementalSort(IncrementalSortState *node);
+extern void ExecReScanIncrementalSort(IncrementalSortState *node);
+
+/* parallel instrumentation support */
+extern void ExecIncrementalSortEstimate(IncrementalSortState *node, ParallelContext *pcxt);
+extern void ExecIncrementalSortInitializeDSM(IncrementalSortState *node, ParallelContext *pcxt);
+extern void ExecIncrementalSortInitializeWorker(IncrementalSortState *node, ParallelWorkerContext *pcxt);
+extern void ExecIncrementalSortRetrieveInstrumentation(IncrementalSortState *node);
+
+#endif /* NODEINCREMENTALSORT_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 0fb5d61a3f..fb490b404c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1982,6 +1982,21 @@ typedef struct MaterialState
Tuplestorestate *tuplestorestate;
} MaterialState;
+
+/* ----------------
+ * When performing sorting by multiple keys, it's possible that the input
+ * dataset is already sorted on a prefix of those keys. We call these
+ * "presorted keys".
+ * PresortedKeyData represents information about one such key.
+ * ----------------
+ */
+typedef struct PresortedKeyData
+{
+ FmgrInfo flinfo; /* comparison function info */
+ FunctionCallInfo fcinfo; /* comparison function call info */
+ OffsetNumber attno; /* attribute number in tuple */
+} PresortedKeyData;
+
/* ----------------
* Shared memory container for per-worker sort information
* ----------------
@@ -2010,6 +2025,71 @@ typedef struct SortState
SharedSortInfo *shared_info; /* one entry per worker */
} SortState;
+/* ----------------
+ * Instrumentation information for IncrementalSort
+ * ----------------
+ */
+typedef struct IncrementalSortGroupInfo
+{
+ int64 groupCount;
+ long maxDiskSpaceUsed;
+ long totalDiskSpaceUsed;
+ long maxMemorySpaceUsed;
+ long totalMemorySpaceUsed;
+ bits32 sortMethods; /* bitmask of TuplesortMethod */
+} IncrementalSortGroupInfo;
+
+typedef struct IncrementalSortInfo
+{
+ IncrementalSortGroupInfo fullsortGroupInfo;
+ IncrementalSortGroupInfo prefixsortGroupInfo;
+} IncrementalSortInfo;
+
+/* ----------------
+ * Shared memory container for per-worker incremental sort information
+ * ----------------
+ */
+typedef struct SharedIncrementalSortInfo
+{
+ int num_workers;
+ IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
+} SharedIncrementalSortInfo;
+
+/* ----------------
+ * IncrementalSortState information
+ * ----------------
+ */
+typedef enum
+{
+ INCSORT_LOADFULLSORT,
+ INCSORT_LOADPREFIXSORT,
+ INCSORT_READFULLSORT,
+ INCSORT_READPREFIXSORT,
+} IncrementalSortExecutionStatus;
+
+typedef struct IncrementalSortState
+{
+ ScanState ss; /* its first field is NodeTag */
+ bool bounded; /* is the result set bounded? */
+ int64 bound; /* if bounded, how many tuples are needed */
+ bool outerNodeDone; /* finished fetching tuples from outer node */
+ int64 bound_Done; /* value of bound we did the sort with */
+ IncrementalSortExecutionStatus execution_status;
+ int64 n_fullsort_remaining;
+ Tuplesortstate *fullsort_state; /* private state of tuplesort.c */
+ Tuplesortstate *prefixsort_state; /* private state of tuplesort.c */
+ /* the keys by which the input path is already sorted */
+ PresortedKeyData *presorted_keys;
+
+ IncrementalSortInfo incsort_info;
+
+ /* slot for pivot tuple defining values of presorted keys within group */
+ TupleTableSlot *group_pivot;
+ TupleTableSlot *transfer_tuple;
+ bool am_worker; /* are we a worker? */
+ SharedIncrementalSortInfo *shared_info; /* one entry per worker */
+} IncrementalSortState;
+
/* ---------------------
* GroupState information
* ---------------------
diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h
index 8a76afe8cc..50b1ba5186 100644
--- a/src/include/nodes/nodes.h
+++ b/src/include/nodes/nodes.h
@@ -74,6 +74,7 @@ typedef enum NodeTag
T_HashJoin,
T_Material,
T_Sort,
+ T_IncrementalSort,
T_Group,
T_Agg,
T_WindowAgg,
@@ -130,6 +131,7 @@ typedef enum NodeTag
T_HashJoinState,
T_MaterialState,
T_SortState,
+ T_IncrementalSortState,
T_GroupState,
T_AggState,
T_WindowAggState,
@@ -245,6 +247,7 @@ typedef enum NodeTag
T_ProjectionPath,
T_ProjectSetPath,
T_SortPath,
+ T_IncrementalSortPath,
T_GroupPath,
T_UpperUniquePath,
T_AggPath,
diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h
index 469c686e3f..a6d206b25a 100644
--- a/src/include/nodes/pathnodes.h
+++ b/src/include/nodes/pathnodes.h
@@ -1639,6 +1639,15 @@ typedef struct SortPath
} SortPath;
/*
+ * IncrementalSortPath
+ */
+typedef struct IncrementalSortPath
+{
+ SortPath spath;
+ int nPresortedCols; /* number of presorted columns */
+} IncrementalSortPath;
+
+/*
* GroupPath represents grouping (of presorted input)
*
* groupClause represents the columns to be grouped on; the input path
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 4869fe7b6d..be8ef54a1e 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -774,6 +774,16 @@ typedef struct Sort
bool *nullsFirst; /* NULLS FIRST/LAST directions */
} Sort;
+/* ----------------
+ * incremental sort node
+ * ----------------
+ */
+typedef struct IncrementalSort
+{
+ Sort sort;
+ int nPresortedCols; /* number of presorted columns */
+} IncrementalSort;
+
/* ---------------
* group node -
* Used for queries with GROUP BY (but no aggregates) specified.
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 735ba09650..9710e5c0a4 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -53,6 +53,7 @@ extern PGDLLIMPORT bool enable_indexonlyscan;
extern PGDLLIMPORT bool enable_bitmapscan;
extern PGDLLIMPORT bool enable_tidscan;
extern PGDLLIMPORT bool enable_sort;
+extern PGDLLIMPORT bool enable_incrementalsort;
extern PGDLLIMPORT bool enable_hashagg;
extern PGDLLIMPORT bool enable_hashagg_disk;
extern PGDLLIMPORT bool enable_groupingsets_hash_disk;
@@ -103,6 +104,11 @@ extern void cost_sort(Path *path, PlannerInfo *root,
List *pathkeys, Cost input_cost, double tuples, int width,
Cost comparison_cost, int sort_mem,
double limit_tuples);
+extern void cost_incremental_sort(Path *path,
+ PlannerInfo *root, List *pathkeys, int presorted_keys,
+ Cost input_startup_cost, Cost input_total_cost,
+ double input_tuples, int width, Cost comparison_cost, int sort_mem,
+ double limit_tuples);
extern void cost_append(AppendPath *path);
extern void cost_merge_append(Path *path, PlannerInfo *root,
List *pathkeys, int n_streams,
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index e450fe112a..bcd08af753 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -184,6 +184,12 @@ extern ProjectSetPath *create_set_projection_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
PathTarget *target);
+extern SortPath *create_incremental_sort_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ List *pathkeys,
+ int presorted_keys,
+ double limit_tuples);
extern SortPath *create_sort_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index c689fe8e26..ab61f306cb 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -185,6 +185,7 @@ typedef enum
extern PathKeysComparison compare_pathkeys(List *keys1, List *keys2);
extern bool pathkeys_contained_in(List *keys1, List *keys2);
+extern bool pathkeys_count_contained_in(List *keys1, List *keys2, int *n_common);
extern Path *get_cheapest_path_for_pathkeys(List *paths, List *pathkeys,
Relids required_outer,
CostSelector cost_criterion,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index a2fdd3fcd3..8d00a9e501 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -61,14 +61,17 @@ typedef struct SortCoordinateData *SortCoordinate;
* Data structures for reporting sort statistics. Note that
* TuplesortInstrumentation can't contain any pointers because we
* sometimes put it in shared memory.
+ *
+ * TuplesortMethod is used in a bitmask in Increment Sort's shared memory
+ * instrumentation so needs to have each value be a separate bit.
*/
typedef enum
{
- SORT_TYPE_STILL_IN_PROGRESS = 0,
- SORT_TYPE_TOP_N_HEAPSORT,
- SORT_TYPE_QUICKSORT,
- SORT_TYPE_EXTERNAL_SORT,
- SORT_TYPE_EXTERNAL_MERGE
+ SORT_TYPE_STILL_IN_PROGRESS = 1 << 0,
+ SORT_TYPE_TOP_N_HEAPSORT = 1 << 1,
+ SORT_TYPE_QUICKSORT = 1 << 2,
+ SORT_TYPE_EXTERNAL_SORT = 1 << 3,
+ SORT_TYPE_EXTERNAL_MERGE = 1 << 4
} TuplesortMethod;
typedef enum
@@ -215,6 +218,7 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType,
bool randomAccess);
extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound);
+extern bool tuplesort_used_bound(Tuplesortstate *state);
extern void tuplesort_puttupleslot(Tuplesortstate *state,
TupleTableSlot *slot);
@@ -239,6 +243,8 @@ extern bool tuplesort_skiptuples(Tuplesortstate *state, int64 ntuples,
extern void tuplesort_end(Tuplesortstate *state);
+extern void tuplesort_reset(Tuplesortstate *state);
+
extern void tuplesort_get_stats(Tuplesortstate *state,
TuplesortInstrumentation *stats);
extern const char *tuplesort_method_name(TuplesortMethod m);