SERVER-74359 Tune t-digest settings

author: Irina Yatsenko <irina.yatsenko@mongodb.com> 2023-04-13 16:23:04 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com> 2023-04-13 20:49:05 +0000
commit: b52563099a666bd3e2d3a2cc1b85de736779ce1b (patch)
tree: 62db073a1beff759cfe5aa6b96ae9381c6125517
parent: 9064cf3001223375fed351d98bc70bc25f1a31db (diff)
download: mongo-b52563099a666bd3e2d3a2cc1b85de736779ce1b.tar.gz
3 files changed, 16 insertions, 11 deletions
diff --git a/src/mongo/db/pipeline/percentile_algo_tdigest.cpp b/src/mongo/db/pipeline/percentile_algo_tdigest.cpp
index acee12d99e3..887822f1297 100644
--- a/src/mongo/db/pipeline/percentile_algo_tdigest.cpp
+++ b/src/mongo/db/pipeline/percentile_algo_tdigest.cpp
@@ -117,10 +117,9 @@ void TDigest::flushBuffer() {
         return;
     }
 
-    // TODO SERVER-74359: 'boost::sort::spreadsort::spreadsort' shows an observable perf improvement
-    // over std::sort and potentially might provide even more benefits if we separate accumulated
-    // data by type, as it can do radix sort on integers. However, we don't currently include
-    // boost::sort into out third_party libs.
+    // TODO SERVER-75565: 'boost::sort::spreadsort::spreadsort' shows an observable perf improvement
+    // over std::sort on large datasets. If we switch to boost's spreadsort, we would need to
+    // re-tune the default delta setting and the size of the buffer.
     std::sort(_buffer.begin(), _buffer.end());
     merge(_buffer);
     _buffer.clear();
@@ -157,9 +156,11 @@ boost::optional<double> TDigest::computePercentile(double p) {
     // contributed to.
     size_t i = 0;  // index of the target centroid
     double r = 0;  // cumulative weight of all centroids up to, and including, i_th one
-    // TODO SERVER-74359 (tune t-digest): is it worth optimizing traversing the set of centroids
-    // backwards for p > 0.5? This likely doesn't matter when TDigest is used by accumulator but
-    // might become noticeable in expressions.
+
+    // We are not optimizing traversing the set of centroids for higher percentiles or when
+    // multiple percentiles have been requested because our benchmarks don't show this to be a
+    // problem in the accumulator context, and for expressions, where it might matter, we are not
+    // using t-digest.
     for (; i < _centroids.size(); i++) {
         r += _centroids[i].weight;
         if (r > rank) {
diff --git a/src/mongo/db/pipeline/percentile_algo_tdigest.h b/src/mongo/db/pipeline/percentile_algo_tdigest.h
index d62de70c19d..a10bad00882 100644
--- a/src/mongo/db/pipeline/percentile_algo_tdigest.h
+++ b/src/mongo/db/pipeline/percentile_algo_tdigest.h
@@ -259,8 +259,8 @@ protected:
 
     // Buffer for the incoming inputs. When the buffer is full, the inputs are sorted and merged
     // into '_centroids'. The max size is set in constructors to bufferCoeff * delta. The
-    // coefficient has been determined empirically from micro-benchmarks.
-    static constexpr int bufferCoeff = 5;
+    // coefficient has been determined empirically from benchmarks.
+    static constexpr int bufferCoeff = 3;
     const size_t _maxBufferSize;
     std::vector<double> _buffer;
 
diff --git a/src/mongo/db/query/query_knobs.idl b/src/mongo/db/query/query_knobs.idl
index f63351f6b60..d7ab1320884 100644
--- a/src/mongo/db/query/query_knobs.idl
+++ b/src/mongo/db/query/query_knobs.idl
@@ -1147,11 +1147,15 @@ server_parameters:
     default: true
 
   internalQueryTdigestDelta:
-    description: "Compaction parameter the for t-digest algorithm."
+    description: "Compaction parameter for the t-digest algorithm. Increasing delta might improve
+     accuracy of the computed percentiles at the cost of using more memory (about 12KB per 1000 of
+     increase). Runtime of t-digest also depends on delta but non-linearly. The current default was
+     chosen empirically to yield a good balance between runtime, memory consumption and accuracy on
+     most datasets."
     set_at: [ startup, runtime ]
     cpp_varname: internalQueryTdigestDelta
     cpp_vartype: AtomicWord<int>
-    default: 1000
+    default: 2000
     validator:
       gte: 0
       lte: 100000 # arbitrary, just to set an upper limit on the amount of memory used by t-digest
author	Irina Yatsenko <irina.yatsenko@mongodb.com>	2023-04-13 16:23:04 +0000
committer	Evergreen Agent <no-reply@evergreen.mongodb.com>	2023-04-13 20:49:05 +0000
commit	b52563099a666bd3e2d3a2cc1b85de736779ce1b (patch)
tree	62db073a1beff759cfe5aa6b96ae9381c6125517
parent	9064cf3001223375fed351d98bc70bc25f1a31db (diff)
download	mongo-b52563099a666bd3e2d3a2cc1b85de736779ce1b.tar.gz