diff options
author | Irina Yatsenko <irina.yatsenko@mongodb.com> | 2023-04-13 16:23:04 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-04-13 20:49:05 +0000 |
commit | b52563099a666bd3e2d3a2cc1b85de736779ce1b (patch) | |
tree | 62db073a1beff759cfe5aa6b96ae9381c6125517 | |
parent | 9064cf3001223375fed351d98bc70bc25f1a31db (diff) | |
download | mongo-b52563099a666bd3e2d3a2cc1b85de736779ce1b.tar.gz |
SERVER-74359 Tune t-digest settings
-rw-r--r-- | src/mongo/db/pipeline/percentile_algo_tdigest.cpp | 15 | ||||
-rw-r--r-- | src/mongo/db/pipeline/percentile_algo_tdigest.h | 4 | ||||
-rw-r--r-- | src/mongo/db/query/query_knobs.idl | 8 |
3 files changed, 16 insertions, 11 deletions
diff --git a/src/mongo/db/pipeline/percentile_algo_tdigest.cpp b/src/mongo/db/pipeline/percentile_algo_tdigest.cpp index acee12d99e3..887822f1297 100644 --- a/src/mongo/db/pipeline/percentile_algo_tdigest.cpp +++ b/src/mongo/db/pipeline/percentile_algo_tdigest.cpp @@ -117,10 +117,9 @@ void TDigest::flushBuffer() { return; } - // TODO SERVER-74359: 'boost::sort::spreadsort::spreadsort' shows an observable perf improvement - // over std::sort and potentially might provide even more benefits if we separate accumulated - // data by type, as it can do radix sort on integers. However, we don't currently include - // boost::sort into out third_party libs. + // TODO SERVER-75565: 'boost::sort::spreadsort::spreadsort' shows an observable perf improvement + // over std::sort on large datasets. Switching to boost's spreadsort would require re-tuning + // the default delta setting and the size of the buffer. std::sort(_buffer.begin(), _buffer.end()); merge(_buffer); _buffer.clear(); @@ -157,9 +156,11 @@ boost::optional<double> TDigest::computePercentile(double p) { // contributed to. size_t i = 0; // index of the target centroid double r = 0; // cumulative weight of all centroids up to, and including, i_th one - // TODO SERVER-74359 (tune t-digest): is it worth optimizing traversing the set of centroids - // backwards for p > 0.5? This likely doesn't matter when TDigest is used by accumulator but - // might become noticeable in expressions. + + // We are not optimizing traversing the set of centroids for higher percentiles or when + // multiple percentiles have been requested because our benchmarks don't show this to be a + // problem in the accumulator context, and for expressions, where it might matter, we are not + // using t-digest. 
for (; i < _centroids.size(); i++) { r += _centroids[i].weight; if (r > rank) { diff --git a/src/mongo/db/pipeline/percentile_algo_tdigest.h b/src/mongo/db/pipeline/percentile_algo_tdigest.h index d62de70c19d..a10bad00882 100644 --- a/src/mongo/db/pipeline/percentile_algo_tdigest.h +++ b/src/mongo/db/pipeline/percentile_algo_tdigest.h @@ -259,8 +259,8 @@ protected: // Buffer for the incoming inputs. When the buffer is full, the inputs are sorted and merged // into '_centroids'. The max size is set in constructors to bufferCoeff * delta. The - // coefficient has been determined empirically from micro-benchmarks. - static constexpr int bufferCoeff = 5; + // coefficient has been determined empirically from benchmarks. + static constexpr int bufferCoeff = 3; const size_t _maxBufferSize; std::vector<double> _buffer; diff --git a/src/mongo/db/query/query_knobs.idl b/src/mongo/db/query/query_knobs.idl index f63351f6b60..d7ab1320884 100644 --- a/src/mongo/db/query/query_knobs.idl +++ b/src/mongo/db/query/query_knobs.idl @@ -1147,11 +1147,15 @@ server_parameters: default: true internalQueryTdigestDelta: - description: "Compaction parameter the for t-digest algorithm." + description: "Compaction parameter for the t-digest algorithm. Increasing delta might improve + accuracy of the computed percentiles at the cost of using more memory (about 12KB per 1000 of + increase). Runtime of t-digest also depends on delta but non-linearly. The current default was + chosen empirically to yield a good balance between runtime, memory consumption and accuracy on + most datasets." set_at: [ startup, runtime ] cpp_varname: internalQueryTdigestDelta cpp_vartype: AtomicWord<int> - default: 1000 + default: 2000 validator: gte: 0 lte: 100000 # arbitrary, just to set an upper limit on the amount of memory used by t-digest |